<?xml version="1.0" encoding="UTF-8"?>
<TEI xml:space="preserve" xmlns="http://www.tei-c.org/ns/1.0" 
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
xsi:schemaLocation="http://www.tei-c.org/ns/1.0 https://raw.githubusercontent.com/kermitt2/grobid/master/grobid-home/schemas/xsd/Grobid.xsd"
 xmlns:xlink="http://www.w3.org/1999/xlink">
	<teiHeader xml:lang="en">
		<fileDesc>
			<titleStmt>
				<title level="a" type="main">Spiking Emotions: Dynamic Vision Emotion Recognition Using Spiking Neural Networks 1</title>
			</titleStmt>
			<publicationStmt>
				<publisher/>
				<availability status="unknown"><licence/></availability>
			</publicationStmt>
			<sourceDesc>
				<biblStruct>
					<analytic>
						<author>
							<persName><forename type="first">Binqiang</forename><surname>Wang</surname></persName>
							<affiliation key="aff0">
								<orgName type="institution">Shandong Massive Information Technology Research Institute</orgName>
								<address>
									<country key="CN">China</country>
								</address>
							</affiliation>
							<affiliation key="aff1">
								<orgName type="department">State Key Laboratory of High-end Server &amp; Storage Technology Inspur</orgName>
								<orgName type="institution">Beijing) Electronic Information Industry Co. Ltd</orgName>
								<address>
									<settlement>Beijing</settlement>
									<country key="CN">China</country>
								</address>
							</affiliation>
						</author>
						<author role="corresp">
							<persName><forename type="first">Gang</forename><surname>Dong</surname></persName>
							<email>donggang@inspur.com</email>
							<affiliation key="aff1">
								<orgName type="department">State Key Laboratory of High-end Server &amp; Storage Technology Inspur</orgName>
								<orgName type="institution">Beijing) Electronic Information Industry Co. Ltd</orgName>
								<address>
									<settlement>Beijing</settlement>
									<country key="CN">China</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Yaqian</forename><surname>Zhao</surname></persName>
							<affiliation key="aff1">
								<orgName type="department">State Key Laboratory of High-end Server &amp; Storage Technology Inspur</orgName>
								<orgName type="institution">Beijing) Electronic Information Industry Co. Ltd</orgName>
								<address>
									<settlement>Beijing</settlement>
									<country key="CN">China</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Rengang</forename><surname>Li</surname></persName>
							<affiliation key="aff1">
								<orgName type="department">State Key Laboratory of High-end Server &amp; Storage Technology Inspur</orgName>
								<orgName type="institution">Beijing) Electronic Information Industry Co. Ltd</orgName>
								<address>
									<settlement>Beijing</settlement>
									<country key="CN">China</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Hongbin</forename><surname>Yang</surname></persName>
							<affiliation key="aff1">
								<orgName type="department">State Key Laboratory of High-end Server &amp; Storage Technology Inspur</orgName>
								<orgName type="institution">Beijing) Electronic Information Industry Co. Ltd</orgName>
								<address>
									<settlement>Beijing</settlement>
									<country key="CN">China</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Wenfeng</forename><surname>Yin</surname></persName>
							<affiliation key="aff1">
								<orgName type="department">State Key Laboratory of High-end Server &amp; Storage Technology Inspur</orgName>
								<orgName type="institution">Beijing) Electronic Information Industry Co. Ltd</orgName>
								<address>
									<settlement>Beijing</settlement>
									<country key="CN">China</country>
								</address>
							</affiliation>
						</author>
						<author>
							<persName><forename type="first">Lingyan</forename><surname>Liang</surname></persName>
							<affiliation key="aff1">
								<orgName type="department">State Key Laboratory of High-end Server &amp; Storage Technology Inspur</orgName>
								<orgName type="institution">Beijing) Electronic Information Industry Co. Ltd</orgName>
								<address>
									<settlement>Beijing</settlement>
									<country key="CN">China</country>
								</address>
							</affiliation>
						</author>
						<title level="a" type="main">Spiking Emotions: Dynamic Vision Emotion Recognition Using Spiking Neural Networks 1</title>
					</analytic>
					<monogr>
						<imprint>
							<date/>
						</imprint>
					</monogr>
					<idno type="MD5">FB84C86DAF266AC6FBCE3F7C35AF8B59</idno>
				</biblStruct>
			</sourceDesc>
		</fileDesc>
		<encodingDesc>
			<appInfo>
				<application version="0.7.2" ident="GROBID" when="2023-03-25T05:55+0000">
					<desc>GROBID - A machine learning software for extracting information from scholarly documents</desc>
					<ref target="https://github.com/kermitt2/grobid"/>
				</application>
			</appInfo>
		</encodingDesc>
		<profileDesc>
			<textClass>
				<keywords>
					<term>spiking neural network</term>
					<term>dynamic vision sensor</term>
					<term>emotion recognition</term>
				</keywords>
			</textClass>
			<abstract>
<div xmlns="http://www.tei-c.org/ns/1.0"><p>Emotion recognition from vision information is a significant research topic in the computer vision community. The current prevalent solution based on Artificial Neural Networks (ANNs) demonstrates high accuracy but large computation consumption. Compared with ANNs, Spiking Neural Networks (SNNs) are more biologically realistic and computationally effective. However, it still remains a great challenge to utilize SNNs to vision emotion recognition, mainly due to the lack emotional dataset of Dynamic Vision Sensor (DVS) and a properly designed SNN framework. In this paper, we present a method to generate a simulation dataset of DVS, leveraging the existed emotion recognition dataset containing video segments. Meanwhile, an SNN framework and its counterpart ANNs are adopted to complete dynamic vision emotion recognition based on the simulated DVS dataset and original frames data respectively. The proposed SNN framework consists of a feature extraction module to extract informative features based on spike-trains of input, a voting neurons group module containing two groups of emotional neurons, and an emotional mapping module to translate output spiketrains to emotion polarity labels. The results demonstrate that compared with the ANN, the proposed SNN can achieve better performance and its energy consumption is only one-quarter of the ANN's.</p></div>
			</abstract>
		</profileDesc>
	</teiHeader>
	<text xml:lang="en">
		<body>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1.">Introduction</head><p>Emotion recognition, as a hot research topic in the affective computing community, has derived many researchers' attention coming from domains like computer vision <ref type="bibr" target="#b0">[1,</ref><ref type="bibr" target="#b1">2]</ref>, natural language processing <ref type="bibr" target="#b2">[3]</ref>, speech processing <ref type="bibr" target="#b3">[4,</ref><ref type="bibr" target="#b4">5]</ref>, and human-computer interaction <ref type="bibr" target="#b5">[6,</ref><ref type="bibr" target="#b6">7]</ref>. At present, most methods adopt Artificial Neural Networks (ANNs) to perform emotion recognition, which achieves state-of-art solutions. An efficient emotion recognition method will facilitate communication between people on the wearable scenes <ref type="bibr" target="#b7">[8]</ref>. However, the high energy consumption of ANNs hinders emotion recognition's application on embedded and mobile devices. Although knowledge distillation <ref type="bibr" target="#b8">[9]</ref> and neural architecture search <ref type="bibr" target="#b9">[10]</ref> can obtain ANN architecture with fewer parameters to reduce energy consumption and be suitable for mobile devices, it does not change the essence of ANNs.</p><p>As the third generation of neural networks, Spiking Neural Network (SNN) <ref type="bibr" target="#b10">[11]</ref> with low power consumption is one potential solution to lead to an embedded and mobile emotion recognition algorithm reality. Some researches applying SNNs to complete emotion recognition tasks have been proposed to extract emotion information from speech, cross-modal, or electroencephalograph (EGG) <ref type="bibr" target="#b11">[12,</ref><ref type="bibr" target="#b12">13,</ref><ref type="bibr" target="#b13">14,</ref><ref type="bibr" target="#b14">15,</ref><ref type="bibr" target="#b15">16]</ref>. The feature extraction in most of these methods involves the pre-processing operation, the audio feature extraction such as Mel cepstrum coefficient. To complete emotion recognition, a shallow SNN, a three-layer in most existing methods is adopted as a classifier. Based on these techniques, previous methods have accomplished encouraging performance on relevant datasets. Nevertheless, it remains challenging to extract emotional representative information using SNNs from video segments. The first challenge is to collect an emotion recognition dataset utilizing a dynamic vision sensor, which is expensive to conduct. To mitigate this cost, a simulated method to generate simulated spikes-like data herein is proposed inspired by frame difference encoding in <ref type="bibr" target="#b16">[17]</ref>. Note that in order to better simulate the mechanism of human ocular nerve receiving information, a kind of float value frame is adopted in the simulated method, which is a novel scheme in the spiking encoding domain <ref type="bibr" target="#b17">[18,</ref><ref type="bibr" target="#b18">19]</ref>. On the other hand, the structures in existing SNN-based emotion recognition methods are simple and the spiking neuron model used in most previous literature is Leaky Integrate-and-Fire (LIF) model. 
To take full advantage of existing abstract structures in ANNs, a framework is designed to leverage the latest progress in SNNs proposed in <ref type="bibr" target="#b19">[20]</ref>, where a new spiking neuron model termed the Parametric Leaky Integrate-and-Fire (PLIF) neuron model is introduced.</p><p>In this paper, we propose a scheme that combines the advantage of the short-term high performance of ordinary cameras with the low energy consumption of dynamic vision sensors. Thus, the simulated data contains both float-valued data for the first capture of the scene and spike-train data for the remaining observation period. Experiments are designed to demonstrate the effectiveness of the proposed scheme.</p><p>Our contributions are summarized as follows: 1) To the best of our knowledge, this is the first attempt to apply SNNs to emotion recognition based on simulated dynamic vision sensor data. As SNNs have higher biological plausibility than ANNs, the combination of SNNs and the dynamic vision sensor may be helpful for exploiting the emotions expressed by humans. 2) We propose a method to generate simulated dynamic vision sensor data. Note that the generated data is not pure spikes. Considering the real application scene, the first frame is represented by float values and the following frames consist of pure spikes. 3) The Parametric Leaky Integrate-and-Fire (PLIF) neuron is adopted to construct the SNN in this paper. We evaluate the SNN on the simulated dynamic vision sensor data. The SNN achieves better performance than its counterpart ANN.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.">Method</head></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1">DVS Simulation Algorithm</head><p>The simulation algorithm is explained in detail in this section. Firstly, the concept of DVS is introduced briefly. Then, the data format of the DVS is clarified. Finally, the simulation algorithm is present to generate DVS format data based on video segments.</p><p>Focusing on the dynamic information, DVS records the dynamic changes of a scene that is under perception. Different from recording the whole scene pixel by pixel with a float number representing the intensity of light in traditional cameras, DVS only captures the changes of light of the scene, the recorded contents are either 0 or 1, which indicates whether the intensity of a location in the scene has changed. It is not trivial to directly collect emotion recognition data using DVS as DVS is expensive. An alternative idea is to generate simulated dynamic vision emotion recognition data in terms of the data format and the existing vision emotion dataset.</p><p>The data generated by DVS are named neuromorphic data which is represented by E(𝑥 , 𝑦 , 𝑡 , 𝑝 ) (i=0,1,...,N-1), where 𝑥 , 𝑦 is the location where the event happened, 𝑡 is the time when the event occurred, 𝑝 is the polarity of the event.</p><p>Emotion recognition based on video segments provides a series of frames that record the change in a scene, which is publicly available <ref type="bibr" target="#b20">[21]</ref>. To simulate a DVS's output based on them, the RGB frame is converted to gray to represent the intensity of each frame. Then the difference between adjoin frames is used to generate the polarity. A hyperparameter is named sensitivity to represent the degree of the intensity change. Finally, the spiking is formed by frame series order in the original video to the simulation output. The final representation is the simulation of DVS's output based on video segments, which is compatible with the data format recorded by DVS. Note that to consider the real phenomenon: We catch a scene firstly with whole and then attend to the change. Inspired by this phenomenon, the first frame is set as float-valued gray information. In other words, the output of the algorithm includes two parts: the first frame with float-value representing the ordinary camera and other frames of spike values representing dynamic vision sensor.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2">Neuron Model</head><p>The principle of SNN is to mimic the cell in the brain on the micro-physiological scale. So the neuron model in SNN is different from that in traditional ANNs. Generally, the basic neuron model in ANN is the McCulloch-Pitts model, while the popular component neuron model in SNN is Leaky Integrate-and-Fire (LIF) model. The information transition in neuron cells is not just the summation of all the input coming from other neurons by synapses. Actually, as time goes on, the input is accumulated in the cell membrane to cause the increase of cell membrane potential. Once the membrane potential exceeds a certain threshold, a spike is generated, and then the potential is set to a reset value. LIF <ref type="bibr" target="#b19">[20]</ref> can capture the temporal information transmitted in SNN, which can be defined as:</p><formula xml:id="formula_0">τ ( ) = −(𝑉(𝑡) − 𝑉 ) + 𝑋(𝑡),<label>(1)</label></formula><p>where 𝑉(𝑡) is the cell membrane potential at time 𝑡, 𝑋(𝑡) denotes the inputs at time 𝑡, τ is the membrane time constant, 𝑉 is the reset value after one spike is generated. The threshold of potential can be represented by 𝑉 , the generation of one spiking at time 𝑡 can be formatted as:</p><formula xml:id="formula_1">S(𝑡) = 0, 𝑉(𝑡) &lt; 𝑉 1, 𝑉(𝑡) ≥ 𝑉 , (<label>2</label></formula><formula xml:id="formula_2">)</formula><p>where 1 is a spike and 0 means no operation. Generally, the τ in Eq. ( <ref type="formula" target="#formula_0">1</ref>) is a constant. Based on the case analysis in <ref type="bibr" target="#b19">[20]</ref>, the Parametric Leaky Integrate-and-Fire (PLIF) spiking neuron model is proposed to adjust the τ during the training phase. To incorporate the expressiveness of the novel neuron type, PLIF is adopted herein as a fundamental structure of the SNN framework for dynamic vision emotion recognition. Following the strategy in <ref type="bibr" target="#b19">[20]</ref>, the surrogate gradient method is used to make backpropagation-based learning work.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3">SNN Framework</head><p>The neuron cells in the brain are connected by synapses. High biological plausibility requires the connections of neurons in SNN to mimic the true structure in the brain. However, biological scientists are still studying the connections of the brain, and some regional structures have been used to build neural networks. The convolutional layer's design is an imitation of the GCs part in the human brain while the pooling layer achieves a similar invariant effect of CCs in V1 and V4 <ref type="bibr" target="#b21">[22]</ref>. Therefore, the convolutional layer and pooling layer are adopted to make up the SNN framework to complete dynamic vision emotion recognition.</p><p>The aim of dynamic vision emotion recognition is to analyze emotional results given a series of data recording a specific scenario. The proposed SNN Framework consists of three parts, including a feature extraction module to extract informative features from input spike-trains, a voting neuron group module to give spike-trains of different emotion neuron groups, and an emotional mapping module to convert the spike-trains to final emotion polarity results. The overview of the framework including the dataset simulation process is illustrated in Fig. <ref type="figure" target="#fig_0">1</ref>. The details of the DVS Simulation Algorithm have been introduced before and other modules will be presented below.  from the real data recorded by DVS, whose temporal resolution is high, the temporal resolution of spiketrains generated based on video frames is determined by the temporal resolution of videos. This schema of organizing data excludes the step of converting the asynchronous event stream into frames. Moreover, as mentioned before, the first frame is set to float-value which is more resembles the real application scenario. The component of FEM utilized herein is inspired by <ref type="bibr" target="#b19">[20]</ref>, where convolution, batch normalization, max-pooling, and PLIF neuron are adopted. Different from <ref type="bibr" target="#b19">[20]</ref>, the max-pooling is replaced by average pooling (AvgPool) based on the experimental results. Experimental results prove the superiority of average pooling on our dataset. This may be due to the tremendous information loss in spike-trains of max-pooling, as stated in <ref type="bibr" target="#b22">[23]</ref>.</p><p>2) Voting Neurons Group Module: To obtain the final emotion results, there need neurons to represent the corresponding emotion label. Suppose there are two emotion labels: positive emotions and negative emotions. In traditional ANNs, two different neurons are generally used directly to represent two different emotions. But in the human brain, the transmission of information is often carried out through a group of neurons. Two neuron populations composed of 10 neurons are applied to represent the final output: 10 positive voting neurons and 10 negative voting neurons. The informative features from FEM are divided into two groups of voting neurons. This module has no additional parameters, which only operated as reorganizing the spike-trains from another perspective. Thus, the final outputs of the voting module are spike-trains of 20 neurons and the length of each train is the simulating steps.</p><p>3) Emotional Mapping Module: Emotional Mapping Module is served as a classifier to conduct the final emotion recognition, specifically, to translate the output spike-trains into emotion labels. 
In the SNN, the fire rates of the neurons over the simulation steps, whose number is denoted by N, are applied to represent the contribution to the corresponding target. One potential way is to directly define the desired spike-trains for each emotion label. However, such a definition is intricate and tedious <ref type="bibr" target="#b23">[24]</ref>, and the techniques for measuring the distance between two spike-trains are not as mature as the measurement between two float-valued vectors. Thus, the average fire rate of the 10 positive voting neurons is treated as the final positive output of emotion recognition, and likewise for the negative voting neurons. If the average fire rate of the positive voting neurons is larger than that of the negative voting neurons, the input scene is positive, and vice versa. Observed in the spike-trains, a higher fire rate means a denser distribution of spikes during the simulation period.</p><p>The average fire rate makes the measurement of the output simple, and the Mean Squared Error (MSE) is used to measure the difference between the average fire rates and the ground-truth emotion labels. The surrogate gradient method is applied to update the parameters in the framework by backpropagation.</p></div>
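<p>To make the data flow through the three modules concrete, the following is a minimal sketch that strings together a feature extraction stack (convolution, batch normalization, PLIF neurons and average pooling), the 20-neuron voting layer, and the per-step group averaging used by the emotional mapping; layer widths and the flattened feature size are illustrative assumptions for a 128×128 input, and PLIFNeuron refers to the sketch in Section 2.2.</p>
<p>
# A minimal sketch of the proposed framework: FEM -> voting neuron groups -> emotional mapping.
# Layer widths and the flattened feature size are illustrative assumptions for a 128x128 input;
# PLIFNeuron refers to the sketch in Section 2.2.
import torch
import torch.nn as nn

class EmotionSNN(nn.Module):
    def __init__(self, in_channels=2, neurons_per_group=10, num_classes=2):
        super().__init__()
        self.fem = nn.Sequential(                      # Feature Extraction Module
            nn.Conv2d(in_channels, 32, 3, padding=1, bias=False),
            nn.BatchNorm2d(32),
            PLIFNeuron(),
            nn.AvgPool2d(2),                           # average pooling instead of max-pooling
            nn.Conv2d(32, 32, 3, padding=1, bias=False),
            nn.BatchNorm2d(32),
            PLIFNeuron(),
            nn.AvgPool2d(2),
            nn.Flatten(),
            nn.Linear(32 * 32 * 32, num_classes * neurons_per_group, bias=False),
            PLIFNeuron(),                              # final spiking layer: 20 output neurons
        )
        self.neurons_per_group = neurons_per_group
        self.num_classes = num_classes

    def forward(self, x_t):
        # x_t: input at a single simulating step, shape (batch, channels, 128, 128).
        spikes = self.fem(x_t)
        # Voting neurons group module: reorganize the 20 spiking outputs into two groups
        # of 10 neurons each; this step adds no parameters.
        groups = spikes.view(-1, self.num_classes, self.neurons_per_group)
        # Emotional mapping module: the per-step group average; averaging these values over
        # all simulating steps gives the average fire rate used for classification.
        return groups.mean(dim=2)
</p>
<p>Averaging the per-step outputs over all N simulating steps yields the two average fire rates, and the larger one determines the predicted emotion polarity, as described above.</p>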
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.">Experiments</head><p>In this section, we firstly introduce a popular dataset for emotion recognition based on video segments and the simulation DVS based on this dataset will be presented. Then, the experimental setting is followed to detail the SNN settings. Finally, the results of experiments will be analyzed to verify the functionality of SNN based on the simulated dataset.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1">Dataset</head><p>To conduct experiments with the previous model, a simulation dataset needs to be constructed first. Herein Carnegie Mellon University Multimodal Opinion Sentiment Intensity (CMU-MOSI) <ref type="bibr" target="#b20">[21]</ref>, a dataset recognized by the community and widely used, is chosen as the basis for the DVS simulation algorithm. The number of categories is mapped to two (positive and negative) based on the original label. Following the schedule mentioned before, a simulation DVS dataset can be generated. Note that the hyperparameter, $Sens$, will influence the simulation results. A larger $Sens$ will make the generated spike-trains sparser and a smaller $Sens$ will generate more spikes. Generally speaking, dense spike trains can achieve better accuracy, and sparse spike trains can potentially achieve lower energy consumption. $Sens$ is set to 0.001 to trade-off the accuracy and energy consumption.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2">Experimental setting TABLE I Emotion recognition on simulated dataset</head><p>Algorithms Explanation Accuracy PLIF-Fang <ref type="bibr" target="#b19">[20]</ref> network in <ref type="bibr" target="#b19">[20]</ref> 61.69 ANN counterpart of baseline SNN 65.17 Experiments are implemented by SpikingJelly <ref type="bibr" target="#b24">[25]</ref>, a framework for SNN. The code runs on a Linux system with four Tesla V100 graphics cards. Initialization of the weights of synapses is completed by the default method of PyTorch with a fixed random seed <ref type="bibr" target="#b25">[26]</ref>. To optimize the parameters, the stochastic gradient descent optimizer based on surrogate gradients implemented in SpikingJelly is utilized and the learning rate is 0.01. The batch size is set to 8 for all experiments. To find an appropriate time length of the input signal, we count the frame numbers of video segments and find that a large number of video segment samples are around 68. Thus, we set the time length to 68 herein. The input of neural networks is rescaled to 128×128. In order to make the comparison as fair as possible, an ANN basically parallel to the SNN structure is constructed, which is called the counterpart ANN, where the PLIF is changed to ReLU. The classifier for ANN is designed as a fully connected layer. We evaluate the performance of the SNN and its counterpart ANN on the simulation dataset.</p></div>
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3">Experimental Results</head><p>The performance of the testing set is summarized in Table <ref type="table">I</ref>. The first column represents the compared methods or different hyper parameter setting algorithms. The second column is the explanation of the algorithms. The last column reported the accuracy of corresponding methods. Compared with the PLIF-Fang from <ref type="bibr" target="#b19">[20]</ref>, the counterpart ANN of baseline SNN gives a performance increase of 3.48%. It can be seen that the performance of PLIF-Fang and baseline SNN is the same, which can be explained by the influence of different network structures. Although LIF is applicable to the baseline SNN here, and its performance is theoretically worse than PLIF-Fang. But the pooling method is different: the max-pooling is utilized in PLIF-Fang while average pooling is adopted in baseline SNN.</p><p>The performance difference of these two pooling methods on the simulated dataset can be seen from the algorithms of 0.4SNN and mpSNN in Table <ref type="table">I</ref>. The max-pooling will damage the performance of the model, which may be due to the loss of some local information compared with average pooling. Comparing baseline SNN to PLIF SNN, we can find that the performance of networks constructed with the PLIF neuron model is better (2.24% accuracy improvement), which is consistent with the conclusion in <ref type="bibr" target="#b19">[20]</ref>. However, a different setting about the voltage threshold is presented based on our experiments on our dataset. The default voltage threshold is 1, it is thought unnecessary to adjust the voltage threshold in <ref type="bibr" target="#b19">[20]</ref>. But in practice, we find that a relatively decreasing voltage threshold can obtain performance gain. It is shown that 0.4SNN achieves a better emotion recognition performance than its counterpart ANN. We argue that a relatively smaller voltage threshold of membrane potentials fires the neurons earlier, which causes a relative more training to reach a better performance. Finally, a pure spike-trains input setting is conducted to validate the effectiveness of the first float-value frame setting. It can be seen that compared with 0.4SNN, the pureSNN has a certain loss of performance.</p><p>To demonstrate what the training influence the SNNs' output at every simulating step (the total simulating herein is 68), experiments are conducted. The setting model using pure spike-trains as input is adopted to show the effect of training. The accuracy curve loss curve on the testing set are shown in Fig. <ref type="figure" target="#fig_3">2</ref>. In each subfigure, the accuracy curve is shown above, and the loss curve is shown below. It is shown in Fig. <ref type="figure" target="#fig_3">2</ref>(a) that the SNNs give the same output for all simulating steps before training. This is due to the random initialization of parameters and no spike is fired in the classifier layer before training. During the early phase of training, some changes can be observed in Fig. <ref type="figure" target="#fig_3">2</ref>(b) to suggest the updates of weights. After training is done, the loss in Fig. <ref type="figure" target="#fig_3">2(c</ref>) decreases as the simulating step is larger at first, but then the loss curve starts to increase a little. A possible explanation is that the following spikes make the model out a result different ground truth. 
The fluctuation of the accuracy curve also illustrates the influence of the spikes that follow the simulating step achieving the lowest loss. In order to illustrate the superiority of the setting that uses float values in the first frame, we visualize the loss and accuracy curves during the training process in Fig. <ref type="figure" target="#fig_3">2</ref>(d). It can be observed that, compared with the pure spike-train input setting, using float values as the first frame of the input leads to faster loss drops at the beginning of training and makes the accuracy exceed 60% earlier. To show the membrane potentials and spike patterns vividly, two examples are shown in Fig. <ref type="figure" target="#fig_4">3</ref>. Note that there are ten neurons corresponding to each of the positive and negative emotions, and the output is defined as the average firing rate. For the membrane potentials in Fig. <ref type="figure" target="#fig_4">3(a)</ref>, for every neuron on the y-axis, the rectangular bars in yellow represent high potentials, which make it relatively easy to fire a spike. The output spike-trains corresponding to these membrane potentials are shown in Fig. <ref type="figure" target="#fig_4">3(b</ref>). Neuron indexes 0 to 9 represent the negative emotion output and neuron indexes 10 to 19 represent the positive emotion output. We can see that the spikes present two different patterns. For the first example, with the positive emotion label, shown in Fig. <ref type="figure" target="#fig_4">3</ref>(b), from a visual point of view the output spike-trains generated by the last 10 neurons are denser than those generated by the first 10 neurons. From the numerical analysis, the average spiking rate of the first ten neurons is 0.4925, which is smaller than that of the last ten neurons, 0.5075. Thus, the predicted emotion is positive. For the second example, shown in Fig. <ref type="figure" target="#fig_4">3(d)</ref>, the visual analysis is similar to the previous example. From the numerical analysis, the average spiking rate of the first ten neurons is 0.5060, which is larger than that of the last ten neurons, 0.4955. Thus, the predicted emotion is negative, whereas the ground truth is positive.</p><p>To demonstrate the efficiency of the SNN, the energy consumption of the SNN and the counterpart ANN is analyzed theoretically. Note that the operations in the SNN are mainly ACcumulate (ACC) operations, while those in the ANN are Multiply-ACcumulate (MAC) operations <ref type="bibr" target="#b26">[27]</ref>. It has been shown that a 32-bit floating-point MAC operation consumes 4.6 pJ while an ACC operation consumes 0.9 pJ in a 45nm 0.9V chip <ref type="bibr" target="#b27">[28]</ref>. The related information is reported in Table <ref type="table" target="#tab_0">II</ref>. As the structures of the SNN and the counterpart ANN are mostly the same, the differences in the numbers of MAC and ACC operations are caused by the input. The input frame's size is 128×128 and the filter kernel size is 3×3. For the ANN with 68-channel input, the number of MAC operations is 310.9M. For the SNN with 2-channel (positive events and negative events) input, the number of ACC operations is 9.1M. It should be noted that the operations of the first layer change from ACC to MAC under the SNN setting with float values in the first frame. The calculation of the remaining, identical part of the structure is 47.9M operations. The total consumption of the ANN is 4.6×(47.9+310.9)=1.65mJ. 
For the SNN, the first step is computed separately and the total consumption is (9.1×4.6) + 9.1×(N−1)×0.9 + 47.9×N×0.9 = (51.3×N+33.67)×10⁻³ mJ. Consequently, it can be calculated that the consumption of the SNN is smaller than that of the counterpart ANN when N&lt;32. We can see from Fig. <ref type="figure" target="#fig_3">2</ref> that the performance is almost stable when the simulating step reaches around 30. A critical point is that when an SNN is implemented on neuromorphic hardware <ref type="bibr" target="#b28">[29,</ref><ref type="bibr" target="#b29">30]</ref>, the computation of the SNN happens exclusively when there is a spike. By counting the proportion of spikes relative to the total frame data in the dataset, it is found that spike events occur at only 10.79% of positions. In other words, the potential efficiency advantage of the SNN is larger than the above-mentioned estimate. Thus, the total consumption of the SNN with 68 simulating steps is about 0.41mJ. Compared with the ANN's energy consumption, the SNN's energy consumption is reduced by three quarters.</p></div>
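<p>The energy estimate above can be reproduced with a few lines of arithmetic; the per-operation energies (4.6 pJ per MAC, 0.9 pJ per ACC), the operation counts (310.9M/9.1M for the first layer, 47.9M for the remaining layers), and the 10.79% spike sparsity are taken from the text, while the exact accounting of sparsity is an assumption, so the last figure only approximately matches the 0.41 mJ reported above.</p>
<p>
# Reproducing the theoretical energy comparison of Section 3.3 (figures from the text).
E_MAC, E_ACC = 4.6e-12, 0.9e-12                              # joules per operation (45 nm, 0.9 V)
ann_first, snn_first, other = 310.9e6, 9.1e6, 47.9e6         # operation counts per step
N = 68                                                       # simulating steps

ann_total = (ann_first + other) * E_MAC                      # about 1.65 mJ
snn_mac   = snn_first * E_MAC                                # first step: float-valued frame uses MAC
snn_acc   = (snn_first * (N - 1) + other * N) * E_ACC        # all remaining accumulate operations
snn_total = snn_mac + snn_acc                                # (51.3*N + 33.67) microjoules
snn_event = snn_mac + snn_acc * 0.1079                       # event-driven: only ~10.79% positions spike
print(f"ANN {ann_total*1e3:.2f} mJ, SNN {snn_total*1e3:.2f} mJ, "
      f"event-driven SNN {snn_event*1e3:.2f} mJ")            # approx. 1.65, 3.52, 0.42 mJ
</p>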
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.">Conclusion</head><p>In this paper, we have proposed a simulated method to generate DVS-like data based on video segments and an SNN framework considering the real application scene to complete recognition. Inspired by the float input in ANN, the first frame of input to SNN is changed from spikes to float-value. The proposed SNN framework presents a feature extraction module for informative spike patterns from simulated input spike-trains and employs a voting neurons group module and emotion mapping module to convert output spike-trains to the final emotion labels. In addition, in our dataset, the theoretical energy consumption of SNN is only a quarter of that of ANN. An interesting future direction is to further explore the topology of other potential structures for SNN.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>1 )</head><label>1</label><figDesc>Feature Extraction Module: Feature Extraction Module (FEM) is utilized to extract informative features from input spike-trains. The original frames of videos are encoded into spike-trains. Different</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Figure 1 .</head><label>1</label><figDesc>Figure 1. The framework of the emotion recognition</figDesc><graphic coords="3,201.60,519.48,192.18,99.24" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head></head><label></label><figDesc>change the voltage threshold of PLIF SNN 65.42 mpSNN change to max-pooling of 0.4SNN 63.43 pureSNN omit the first float-valued frame of 0.4SNN 65.17</figDesc></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 2 .</head><label>2</label><figDesc>Figure 2. Curves of accuracy (above in each subfigure) and loss (bottom in each subfigure) on testing set: The x-axis represents different simulating steps, and the y-axis represents the testing accuracy and loss. (a) Before train with pure spike-trains as input. (b) During train with pure spike-trains as input. (c) After train with pure spike-trains as input. (d) After train with float-value as first frame.</figDesc><graphic coords="5,201.60,196.80,192.24,144.72" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_4"><head>Figure 3 .</head><label>3</label><figDesc>Figure 3. Examples to show the output spike potentials and output spike-trains: The x-axis represents different simulating steps, and the y-axis represents the membrane potential and output spike-trains in the voting neurons group module. Output spike potentials of correct sample (a) and wrong example (c). Output spike-trains of correct sample (b) and wrong sample (d).</figDesc><graphic coords="6,201.60,356.28,192.18,146.52" type="bitmap" /></figure>
<figure xmlns="http://www.tei-c.org/ns/1.0" type="table" xml:id="tab_0"><head>TABLE II</head><label>II</label><figDesc>The number of operations and energy consumption of ANN and SNN on the generated dataset. # denotes the number of operations. M means million.</figDesc><table><row><cell></cell><cell>#First layer</cell><cell>#Other Layers</cell><cell>Energy consumption per operation</cell><cell>Total consumption</cell><cell>Total consumption on the dataset</cell></row><row><cell>ANN</cell><cell>310.9M</cell><cell>47.9M</cell><cell>4.6pJ</cell><cell>1.65mJ</cell><cell>1.65mJ</cell></row><row><cell>SNN</cell><cell>9.1M</cell><cell>47.9M</cell><cell>0.9pJ</cell><cell>1.57mJ(N=30)</cell><cell>0.41mJ</cell></row></table></figure>
		</body>
		<back>

			<div type="acknowledgement">
<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5.">Acknowledgements</head><p>This work was supported by the Natural Science Foundation of Shandong Province (No. ZR2021QF145)</p></div>
			</div>

			<div type="references">

				<listBibl>

<biblStruct xml:id="b0">
	<analytic>
		<title level="a" type="main">Learning non-local spatial correlations to restore sparse 3d single-photon data</title>
		<author>
			<persName><forename type="first">S</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Halimi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Ren</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Mccarthy</surname></persName>
		</author>
		<author>
			<persName><surname>Buller</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Transactions on Image Processing</title>
		<imprint>
			<biblScope unit="issue">99</biblScope>
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b1">
	<analytic>
		<title level="a" type="main">Multi-modal emotion recognition by fusing correlation features of speechvisual</title>
		<author>
			<persName><forename type="first">G</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Zeng</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Signal Processing Letters</title>
		<imprint>
			<biblScope unit="volume">28</biblScope>
			<biblScope unit="page" from="533" to="537" />
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b2">
	<analytic>
		<title level="a" type="main">A sentiment similarity-oriented attention model with multi-task learning for text-based emotion recognition</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Guo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Dang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">International Conference on Multimedia Modeling</title>
				<imprint>
			<publisher>Springer</publisher>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="278" to="289" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b3">
	<analytic>
		<title level="a" type="main">E-ffective: A visual analytic system for exploring the emotion and effectiveness of inspirational speeches</title>
		<author>
			<persName><forename type="first">K</forename><surname>Maher</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Huang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Song</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Deng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Lai</surname></persName>
		</author>
		<author>
			<persName><forename type="first">C</forename><surname>Ma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Wang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Transactions on Visualization and Computer Graphics</title>
		<imprint>
			<biblScope unit="volume">28</biblScope>
			<biblScope unit="issue">1</biblScope>
			<biblScope unit="page" from="508" to="517" />
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b4">
	<analytic>
		<title level="a" type="main">Multimodal cross-and self-attention network for speech emotion recognition</title>
		<author>
			<persName><forename type="first">L</forename><surname>Sun</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Tao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Lian</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">ICASSP 2021-2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</title>
				<imprint>
			<publisher>IEEE</publisher>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="4275" to="4279" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b5">
	<analytic>
		<title level="a" type="main">Non-uniform attention network for multi-modal sentiment analysis</title>
		<author>
			<persName><forename type="first">B</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Dong</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Li</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Cao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Chao</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">International Conference on Multimedia Modeling</title>
				<imprint>
			<publisher>Springer</publisher>
			<date type="published" when="2022">2022</date>
			<biblScope unit="page" from="612" to="623" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b6">
	<analytic>
		<title level="a" type="main">Cped: A chinese positive emotion database for emotion elicitation and analysis</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Zhao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Shu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Ge</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Sun</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Transactions on Affective Computing</title>
		<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b7">
	<analytic>
		<title level="a" type="main">Emotion recognition from spatiotemporal eeg representations with hybrid convolutional recurrent neural networks via wearable multi-channel headset</title>
		<author>
			<persName><forename type="first">J</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Jiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Zhang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Computer Communications</title>
		<imprint>
			<biblScope unit="volume">154</biblScope>
			<biblScope unit="page" from="58" to="65" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b8">
	<analytic>
		<title level="a" type="main">Knowledge distillation and student-teacher learning for visual intelligence: A review and new outlooks</title>
		<author>
			<persName><forename type="first">L</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">K</forename><surname>Yoon</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Transactions on Pattern Analysis and Machine Intelligence</title>
		<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b9">
	<analytic>
		<title level="a" type="main">Neural architecture search: A survey</title>
		<author>
			<persName><forename type="first">T</forename><surname>Elsken</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Metzen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Hutter</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">The Journal of Machine Learning Research</title>
		<imprint>
			<biblScope unit="volume">20</biblScope>
			<biblScope unit="issue">1</biblScope>
			<biblScope unit="page" from="1997" to="2017" />
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b10">
	<analytic>
		<title level="a" type="main">Networks of spiking neurons: the third generation of neural network models</title>
		<author>
			<persName><forename type="first">W</forename><surname>Maass</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Neural networks</title>
		<imprint>
			<biblScope unit="volume">10</biblScope>
			<biblScope unit="issue">9</biblScope>
			<biblScope unit="page" from="1659" to="1671" />
			<date type="published" when="1997">1997</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b11">
	<analytic>
		<title level="a" type="main">Investigating multisensory integration in emotion recognition through bioinspired computational models</title>
		<author>
			<persName><forename type="first">E</forename><surname>Benssassi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Ye</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Transactions on Affective Computing</title>
		<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b12">
	<analytic>
		<title level="a" type="main">Speech emotion recognition using spiking neural networks</title>
		<author>
			<persName><forename type="first">C</forename><surname>Buscicchio</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Górecki</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Caponetti</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">International Symposium on Methodologies for Intelligent Systems</title>
				<imprint>
			<publisher>Springer</publisher>
			<date type="published" when="2006">2006</date>
			<biblScope unit="page" from="38" to="46" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b13">
	<analytic>
		<title level="a" type="main">Biologically inspired speech emotion recognition</title>
		<author>
			<persName><forename type="first">R</forename><surname>Lotfidereshgi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">P</forename><surname>Gournay</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">2017 IEEE international conference on acoustics, speech and signal processing (ICASSP)</title>
				<imprint>
			<publisher>IEEE</publisher>
			<date type="published" when="2017">2017</date>
			<biblScope unit="page" from="5135" to="5139" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b14">
	<analytic>
		<title level="a" type="main">Eeg-based emotion classification using spiking neural networks</title>
		<author>
			<persName><forename type="first">Y</forename><surname>Luo</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Fu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Xie</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Qin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Jiang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Cao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Ding</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Access</title>
		<imprint>
			<biblScope unit="volume">8</biblScope>
			<biblScope unit="page" from="46007" to="46016" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b15">
	<analytic>
		<title level="a" type="main">Speech emotion recognition with early visual cross-modal enhancement using spiking neural networks</title>
		<author>
			<persName><forename type="first">E</forename><surname>Benssassi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Ye</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">2019 International Joint Conference on Neural Networks (IJCNN)</title>
				<imprint>
			<publisher>IEEE</publisher>
			<date type="published" when="2019">2019</date>
			<biblScope unit="page" from="1" to="8" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b16">
	<analytic>
		<title level="a" type="main">A qvga 143 db dynamic range frame-free pwm image sensor with lossless pixel-level video compression and time-domain cds</title>
		<author>
			<persName><forename type="first">C</forename><surname>Posch</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Matolin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Wohlgenannt</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Journal of Solid-State Circuits</title>
		<imprint>
			<biblScope unit="volume">46</biblScope>
			<biblScope unit="issue">1</biblScope>
			<biblScope unit="page" from="259" to="275" />
			<date type="published" when="2010">2010</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b17">
	<analytic>
		<title level="a" type="main">Unsupervised learning of digit recognition using spike-timing-dependent plasticity</title>
		<author>
			<persName><forename type="first">P</forename><surname>Diehl</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Cook</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Frontiers in Computational Neuroscience</title>
		<imprint>
			<biblScope unit="volume">9</biblScope>
			<biblScope unit="page">99</biblScope>
			<date type="published" when="2015">2015</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b18">
	<analytic>
		<title level="a" type="main">A biologically plausible speech recognition framework based on spiking neural networks</title>
		<author>
			<persName><forename type="first">J</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Chua</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Li</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">2018 International Joint Conference on Neural Networks (IJCNN)</title>
				<imprint>
			<date type="published" when="2018">2018</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b19">
	<analytic>
		<title level="a" type="main">Incorporating learnable membrane time constant to enhance learning of spiking neural networks</title>
		<author>
			<persName><forename type="first">W</forename><surname>Fang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Masquelier</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Huang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Tian</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">Proceedings of the IEEE/CVF International Conference on Computer Vision</title>
				<meeting>the IEEE/CVF International Conference on Computer Vision</meeting>
		<imprint>
			<date type="published" when="2021">2021</date>
			<biblScope unit="page" from="2661" to="2671" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b20">
	<monogr>
		<title level="m" type="main">Mosi: multimodal corpus of sentiment intensity and subjectivity analysis in online opinion videos</title>
		<author>
			<persName><forename type="first">A</forename><surname>Zadeh</surname></persName>
		</author>
		<author>
			<persName><forename type="first">R</forename><surname>Zellers</surname></persName>
		</author>
		<author>
			<persName><forename type="first">E</forename><surname>Pincus</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Morency</surname></persName>
		</author>
		<idno type="arXiv">arXiv:1606.06259</idno>
		<imprint>
			<date type="published" when="2016">2016</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b21">
	<analytic>
		<title level="a" type="main">Csnn: An augmented spiking based framework with perceptron-inception</title>
		<author>
			<persName><forename type="first">Q</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Qi</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Shen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Tang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Pan</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">IJCAI</title>
				<imprint>
			<date type="published" when="2018">2018</date>
			<biblScope unit="page" from="1646" to="1652" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b22">
	<analytic>
		<title level="a" type="main">Lisnn: Improving spiking neural networks with lateral interactions for robust object recognition</title>
		<author>
			<persName><forename type="first">X</forename><surname>Cheng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Hao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">B</forename><surname>Xu</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">IJCAI</title>
				<imprint>
			<date type="published" when="2020">2020</date>
			<biblScope unit="page" from="1519" to="1525" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b23">
	<analytic>
		<title level="a" type="main">Unsupervised aer object recognition based on multiscale spatio-temporal features and spiking neurons</title>
		<author>
			<persName><forename type="first">Q</forename><surname>Liu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Pan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Ruan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Xing</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Tang</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">IEEE Transactions on Neural Networks and Learning Systems</title>
		<imprint>
			<biblScope unit="issue">99</biblScope>
			<biblScope unit="page" from="1" to="12" />
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b24">
	<monogr>
		<author>
			<persName><forename type="first">W</forename><surname>Fang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Ding</surname></persName>
		</author>
		<author>
			<persName><forename type="first">D</forename><surname>Chen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Yu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">H</forename><surname>Zhou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Tian</surname></persName>
		</author>
		<ptr target="https://github.com/fangwei123456/spikingjelly" />
		<title level="m">and other contributors</title>
				<imprint>
			<date type="published" when="2020">2020</date>
		</imprint>
	</monogr>
	<note>and other contributors</note>
</biblStruct>

<biblStruct xml:id="b25">
	<analytic>
		<title level="a" type="main">Pytorch: An imperative style, high-performance deep learning library</title>
		<author>
			<persName><forename type="first">A</forename><surname>Paszke</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Gross</surname></persName>
		</author>
		<author>
			<persName><forename type="first">F</forename><surname>Massa</surname></persName>
		</author>
		<author>
			<persName><forename type="first">A</forename><surname>Lerer</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Bradbury</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Chanan</surname></persName>
		</author>
		<author>
			<persName><forename type="first">T</forename><surname>Killeen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Lin</surname></persName>
		</author>
		<author>
			<persName><forename type="first">N</forename><surname>Gimelshein</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Antiga</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Advances in neural information processing systems</title>
		<imprint>
			<biblScope unit="volume">32</biblScope>
			<biblScope unit="page" from="8026" to="8037" />
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b26">
	<monogr>
		<title level="m" type="main">A fully spiking hybrid neural network for energyefficient object detection</title>
		<author>
			<persName><forename type="first">B</forename><surname>Chakraborty</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>She</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Mukhopadhyay</surname></persName>
		</author>
		<idno type="arXiv">arXiv:2104.10719</idno>
		<imprint>
			<date type="published" when="2021">2021</date>
		</imprint>
	</monogr>
	<note type="report_type">arXiv preprint</note>
</biblStruct>

<biblStruct xml:id="b27">
	<analytic>
		<title level="a" type="main">1.1 computing&apos;s energy problem (and what we can do about it)</title>
		<author>
			<persName><forename type="first">M</forename><surname>Horowitz</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="m">IEEE International Solid-State Circuits Conference Digest of Technical Papers (ISSCC)</title>
				<imprint>
			<publisher>IEEE</publisher>
			<date type="published" when="2014">2014. 2014</date>
			<biblScope unit="page" from="10" to="14" />
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b28">
	<analytic>
		<title level="a" type="main">Darwin: A neuromorphic hardware co-processor based on spiking neural networks</title>
		<author>
			<persName><forename type="first">D</forename><surname>Ma</surname></persName>
		</author>
		<author>
			<persName><forename type="first">J</forename><surname>Shen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Gu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Zhu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">X</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Q</forename><surname>Xu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Shen</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Pan</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Journal of Systems Architecture</title>
		<imprint>
			<biblScope unit="volume">77</biblScope>
			<biblScope unit="page" from="43" to="51" />
			<date type="published" when="2017">2017</date>
		</imprint>
	</monogr>
</biblStruct>

<biblStruct xml:id="b29">
	<analytic>
		<title level="a" type="main">Towards artificial general intelligence with hybrid tianjic chip architecture</title>
		<author>
			<persName><forename type="first">J</forename><surname>Pei</surname></persName>
		</author>
		<author>
			<persName><forename type="first">L</forename><surname>Deng</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Song</surname></persName>
		</author>
		<author>
			<persName><forename type="first">M</forename><surname>Zhao</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Y</forename><surname>Zhang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">S</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">G</forename><surname>Wang</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Zou</surname></persName>
		</author>
		<author>
			<persName><forename type="first">Z</forename><surname>Wu</surname></persName>
		</author>
		<author>
			<persName><forename type="first">W</forename><surname>He</surname></persName>
		</author>
	</analytic>
	<monogr>
		<title level="j">Nature</title>
		<imprint>
			<biblScope unit="volume">572</biblScope>
			<biblScope unit="issue">7767</biblScope>
			<biblScope unit="page" from="106" to="111" />
			<date type="published" when="2019">2019</date>
		</imprint>
	</monogr>
</biblStruct>

				</listBibl>
			</div>
		</back>
	</text>
</TEI>
