<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v23i10e25460</article-id>
      <article-id pub-id-type="pmid">34709193</article-id>
      <article-id pub-id-type="doi">10.2196/25460</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Improved Environment-Aware–Based Noise Reduction System for Cochlear Implant Users Based on a Knowledge Transfer Approach: Development and Usability Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Kukafka</surname>
            <given-names>Rita</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chu</surname>
            <given-names>Yuan-Chia</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Tang</surname>
            <given-names>Shih-Tsang</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Lieber Po-Hung</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4799-4105</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Han</surname>
            <given-names>Ji-Yan</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9208-8502</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Zheng</surname>
            <given-names>Wei-Zhong</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4993-2541</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>Ren-Jie</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1386-5452</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Lai</surname>
            <given-names>Ying-Hui</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <address>
            <institution>Department of Biomedical Engineering</institution>
            <institution>National Yang Ming Chiao Tung University</institution>
            <addr-line>No 155, Sec 2, Linong Street</addr-line>
            <addr-line>Taipei, 112</addr-line>
            <country>Taiwan</country>
            <fax>886 228210847</fax>
            <phone>886 228267021</phone>
            <email>yh.lai@nycu.edu.tw</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4120-7289</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Otolaryngology</institution>
        <institution>Cheng Hsin General Hospital</institution>
        <addr-line>Taipei</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Faculty of Medicine, Institute of Brain Science</institution>
        <institution>National Yang Ming Chiao Tung University</institution>
        <addr-line>Taipei</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Medical Research</institution>
        <institution>China Medical University Hospital</institution>
        <institution>China Medical University</institution>
        <addr-line>Taichung</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Speech Language Pathology and Audiology, College of Health Technology</institution>
        <institution>National Taipei University of Nursing and Health Sciences</institution>
        <addr-line>Taipei</addr-line>
        <country>Taiwan</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Biomedical Engineering</institution>
        <institution>National Yang Ming Chiao Tung University</institution>
        <addr-line>Taipei</addr-line>
        <country>Taiwan</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Ying-Hui Lai <email>yh.lai@nycu.edu.tw</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <month>10</month>
        <year>2021</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>28</day>
        <month>10</month>
        <year>2021</year>
      </pub-date>
      <volume>23</volume>
      <issue>10</issue>
      <elocation-id>e25460</elocation-id>
      <history>
        <date date-type="received">
          <day>9</day>
          <month>11</month>
          <year>2020</year>
        </date>
        <date date-type="rev-request">
          <day>30</day>
          <month>11</month>
          <year>2020</year>
        </date>
        <date date-type="rev-recd">
          <day>11</day>
          <month>2</month>
          <year>2021</year>
        </date>
        <date date-type="accepted">
          <day>27</day>
          <month>4</month>
          <year>2021</year>
        </date>
      </history>
      <copyright-statement>©Lieber Po-Hung Li, Ji-Yan Han, Wei-Zhong Zheng, Ren-Jie Huang, Ying-Hui Lai. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 28.10.2021.</copyright-statement>
      <copyright-year>2021</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2021/10/e25460" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Cochlear implant technology is a well-known approach to help deaf individuals hear speech again and can improve speech intelligibility in quiet conditions; however, it still has room for improvement in noisy conditions. More recently, it has been proven that deep learning–based noise reduction, such as noise classification and deep denoising autoencoder (NC+DDAE), can benefit the intelligibility performance of patients with cochlear implants compared to classical noise reduction algorithms.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>Following the successful implementation of the NC+DDAE model in our previous study, this study aimed to propose an advanced noise reduction system using knowledge transfer technology, called NC+DDAE_T; examine the proposed NC+DDAE_T noise reduction system using objective evaluations and subjective listening tests; and investigate which layer substitution of the knowledge transfer technology in the NC+DDAE_T noise reduction system provides the best outcome.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>The knowledge transfer technology was adopted to reduce the number of parameters of the NC+DDAE_T compared with the NC+DDAE. We investigated which layer should be substituted using short-time objective intelligibility and perceptual evaluation of speech quality scores as well as <italic>t</italic>-distributed stochastic neighbor embedding to visualize the features in each model layer. Moreover, we enrolled 10 cochlear implant users for listening tests to evaluate the benefits of the newly developed NC+DDAE_T.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The experimental results showed that substituting the middle layer (ie, the second layer in this study) of the noise-independent DDAE (NI-DDAE) model achieved the best performance gain regarding short-time objective intelligibility and perceptual evaluation of speech quality scores. Therefore, the parameters of layer 3 in the NI-DDAE were chosen to be replaced, thereby establishing the NC+DDAE_T. Both objective and listening test results showed that the proposed NC+DDAE_T noise reduction system achieved similar performances compared with the previous NC+DDAE in several noisy test conditions. However, the proposed NC+DDAE_T only required a quarter of the number of parameters compared to the NC+DDAE.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>This study demonstrated that knowledge transfer technology can help reduce the number of parameters in an NC+DDAE while keeping similar performance rates. This suggests that the proposed NC+DDAE_T model may reduce the implementation costs of this noise reduction system and provide more benefits for cochlear implant users.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>cochlear implants</kwd>
        <kwd>noise reduction</kwd>
        <kwd>deep learning</kwd>
        <kwd>noise classification</kwd>
        <kwd>hearing</kwd>
        <kwd>deaf</kwd>
        <kwd>sound</kwd>
        <kwd>audio</kwd>
        <kwd>cochlear</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Cochlear implants (CIs) are implanted electronic medical devices that can enable patients with profound-to-severe hearing loss to obtain a sense of sound. In their study, Gifford et al [<xref ref-type="bibr" rid="ref1">1</xref>] showed that 28% of individuals equipped with CI achieved 100% speech intelligibility. Sladen et al [<xref ref-type="bibr" rid="ref2">2</xref>] also reported similar results in their study: after undergoing CI implantation, the word accuracy of CI users was 80% in a quiet environment. Although CI users have few obstacles in a quiet environment, there is still scope for improvement in a noisy environment [<xref ref-type="bibr" rid="ref2">2</xref>].</p>
      <p>Noise reduction (NR) is one of the classical methods to alleviate the effect of background noise for CI users. Over the past few decades, many statistical signal processing NR methods have been proposed, such as log minimum mean squared error [<xref ref-type="bibr" rid="ref3">3</xref>], Karhunen-Loéve transform [<xref ref-type="bibr" rid="ref4">4</xref>], Wiener filter based on a priori signal-to-noise ratio (SNR) estimation [<xref ref-type="bibr" rid="ref5">5</xref>], generalized maximum a posteriori spectral amplitude [<xref ref-type="bibr" rid="ref6">6</xref>], and SNR-based [<xref ref-type="bibr" rid="ref7">7</xref>] approaches. Loizou et al [<xref ref-type="bibr" rid="ref8">8</xref>] proposed a single-channel algorithm to conduct NR, and the results showed that the sentence recognition scores in 14 participants with CI improved significantly over their daily performances. Dawson et al [<xref ref-type="bibr" rid="ref7">7</xref>] evaluated a real-time NR algorithm which used the noise estimation to pick up 1 NR approach out of 2 different levels of NR approaches according to the SNR. The study results showed that the proposed NR algorithm could benefit CI users in speech reception threshold under 3 kinds of noise. Mauger et al [<xref ref-type="bibr" rid="ref9">9</xref>] optimized the gain function to achieve a better SNR-based NR, and the results showed that with the optimized gain function, a 27% improvement was achieved for CI users in speech-weighted noise. Although classical NR function can improve speech intelligibility for CI users in stationary noise conditions [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>], improvements are still needed in nonstationary noise conditions [<xref ref-type="bibr" rid="ref10">10</xref>].</p>
      <p>Deep learning (DL)–based NR methods have recently shown better performance than classical statistical-based NR methods [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. Lai et al [<xref ref-type="bibr" rid="ref18">18</xref>] used a deep denoising autoencoder (DDAE)–based NR using vocoder simulation to perform NR function for CI users; the listening test showed that the speech intelligibility was better with DDAE-based NR than with convectional single-microphone NR approaches, whether in stationary or nonstationary noise conditions. Goehring et al [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>] used neural and recurrent neural networks to perform the NR function for CI users, and the results showed that the proposed NR function could significantly improve speech intelligibility in babbling noise conditions. In DL methods, the nonstationary noise can be processed well, but this needs a huge amount of training data in different noise types and SNR levels. However, when a mismatch exists, such as when there is a difference in data between the training and testing phase, the performance of the DL method is usually degraded [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref18">18</xref>].</p>
      <p>An environment-aware–based NR system called noise classifier (NC) +DDAE (NC+DDAE) was proposed to alleviate the above issue [<xref ref-type="bibr" rid="ref21">21</xref>]. The NC+DDAE NR system combines <italic>n</italic>-specific noise-dependent (ND)-DDAE NR models and a noise-independent (NI)-DDAE NR model. The NC function (ie, deep neural network model) was used to distinguish <italic>n</italic> different typical noises and select a suitable DDAE model to perform the NR function for CI users. Hence, the NC function made the NC+DDAE an environment-aware–based NR system. The objective measures and listening test showed that the NC+DDAE model had a much higher performance than did the other NR methods. Although the NC+DDAE model has proven to benefit the CI user and have the flexibility of customization, the NC+DDAE model requires several parameters, which increase the requirements for device implementation. Therefore, the NC+DDAE model needs to be modified to have fewer requirements while maintaining the performance at the same level.</p>
      <p>Recently, the knowledge transfer (so called transfer learning) approach [<xref ref-type="bibr" rid="ref22">22</xref>] has been used in many speech signal processing tasks (eg, speech emotion detection [<xref ref-type="bibr" rid="ref23">23</xref>], text-to-speech system [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], and speech enhancement [<xref ref-type="bibr" rid="ref26">26</xref>]) and has proven to provide benefits for the DL-based model. Knowledge transfer is a machine learning method developed for a specific task that reuses the initial parameters for a new model for the target task. In other words, the knowledge transfer technology transfers the domain knowledge based on the source domain to the target domain to help the DL-based model achieve better performance; furthermore, it can speed up the time needed to develop and train a model by reusing these pieces or modules that have already been developed [<xref ref-type="bibr" rid="ref22">22</xref>]. Following the concept of knowledge transfer technology, we proposed an improved NC+DDAE NR model, called NC+DDAE_transfer (NC+DDAE_T). We first analyzed the differences between features in each layer of DDAE to choose the most suitable layer for NR adaptation. Next, we compared the performance between NC+DDAE and NC+DDAE_T with 2 well-known objective metrics: perceptual evaluation of speech quality (PESQ) [<xref ref-type="bibr" rid="ref27">27</xref>] and short-time objective intelligibility (STOI) [<xref ref-type="bibr" rid="ref28">28</xref>]. The PESQ shows the result of comparing the clean and processed speech by mean opinion score. In the mean opinion score, 5 is the highest score while 1 is the lowest. According to a previous study [<xref ref-type="bibr" rid="ref27">27</xref>], a score over 4 is high enough for most people to listen comfortably and a score of 3.6 is an acceptable boundary for those with normal hearing. 
The STOI represents the speech intelligibility by a correlation coefficient derived from comparing the energy of clean and processed speech in each frame. STOI ranges from 0 to 1, with a higher score representing more clear and understandable speech. Finally, the clinical effectiveness of NC+DDAE_T with the NC+DDAE and DDAE NR systems for patients with CI was evaluated in noisy listening conditions.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <p>In this section, we describe first the NC+DDAE approach. We then introduce the NC+DDAE_T method, the transfer learning–based NC+DDAE NR modified in this study. Finally, we describe the experimental setting and material to prove the benefits of the proposed NC+DDAE_T compared to 2 well-known DL-based NR systems (ie, DDAE and NC+DDAE).</p>
      <sec>
        <title>NR Based on the NC+DDAE Approach</title>
        <p><xref rid="figure1" ref-type="fig">Figure 1</xref> shows the proposed NC+DDAE model in our previous study [<xref ref-type="bibr" rid="ref21">21</xref>], where 2 critical units, NC and DDAE, were included. In this approach, first, the noisy speech signals <bold><italic>y(t)</italic></bold> are processed by feature extraction units to obtain <bold><italic>Y<sub>j</sub><sup>MFCC</sup></italic></bold> and <bold><italic>Y<sub>j</sub><sup>LPS</sup></italic></bold>, which denote Mel-frequency cepstral coefficients [<xref ref-type="bibr" rid="ref30">30</xref>] and log power spectra (LPS) [<xref ref-type="bibr" rid="ref29">29</xref>], respectively, with <italic>j</italic> denoting the frame in the short-time Fourier transform. <bold><italic>Y<sub>j</sub><sup>MFCC</sup></italic></bold> is the input of the NC model to determine the current type of background noise and to select a suitable DDAE model for NR, which includes multiple ND-DDAE models each trained by a model-specific noise type and a single NI-DDAE model trained by 120 noise types [<xref ref-type="bibr" rid="ref15">15</xref>]. When the noisy input signal is similar to one of the specific noise types, the specific ND-DDAE model is chosen for NR; otherwise, the NI-DDAE is used. Afterward, the selected DDAE model processes <bold><italic>Y<sub>j</sub><sup>LPS</sup></italic></bold> to obtain the enhanced features. <inline-graphic xlink:href="jmir_v23i10e25460_fig9.png" xlink:type="simple" mimetype="image"/> is combined with the noisy phase <bold><italic>Y<sup>phase</sup></italic></bold> to finally reconstruct the enhanced speech <inline-graphic xlink:href="jmir_v23i10e25460_fig10.png" xlink:type="simple" mimetype="image"/>. The NC+DDAE NR system has been defined in detail previously [<xref ref-type="bibr" rid="ref21">21</xref>].</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Structure of the noise classifier with a deep denoising autoencoder (NC+DDAE) system. DDAE: deep denoising autoencoder; FFT: fast Fourier transform; IFFT: inverse fast Fourier transform; LPS: log power spectra; NC: noise classifier; ND: noise-dependent; NI: noise-independent; MFCC: Mel-frequency cepstral coefficient.</p>
          </caption>
          <graphic xlink:href="jmir_v23i10e25460_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>NR With the Proposed NC+DDAE_T Approach</title>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> shows the pipeline of the NC+DDAE_T NR approach proposed in this study. The signal processing procedure of the NC+DDAE_T is similar to that of the above-mentioned NC+DDAE. The major difference lies in the NR model as described in the following sections.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Structure of the proposed noise classifier system with DDAE and knowledge transfer. DDAE: deep denoising autoencoder; DNN: deep neural network; FFT: fast Fourier transform; IFFT: inverse fast Fourier transform; LPS: log power spectra; NC: noise classifier; NI: noise-independent;  MFCC: Mel-frequency cepstral coefficient.</p>
          </caption>
          <graphic xlink:href="jmir_v23i10e25460_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <sec>
          <title>NC Model</title>
          <p>The NC model of the proposed NC+DDAE_T is the same as that in our previously described system. Initially, the system receives a noisy speech <bold><italic>y(t)</italic></bold> and computes the <bold><italic>Y<sub>j</sub><sup>MFCC</sup></italic></bold> and <bold><italic>Y<sub>j</sub><sup>LPS</sup></italic></bold> features separately. <bold><italic>Y<sub>j</sub><sup>MFCC</sup></italic></bold> is then sent to the NC model. The NC model is a deep neural network (DNN) composed of 3 hidden layers. Each hidden layer consists of 100 neurons, and the output layer adopts the softmax function [<xref ref-type="bibr" rid="ref30">30</xref>]. The output at the <italic>j-</italic>th node of the <italic>l-</italic>th layer in a DNN h<italic><sub>j</sub></italic><sup>(</sup><italic><sup>l</sup></italic><sup>)</sup> is produced according to equation 1:</p>
          <p><graphic xlink:href="jmir_v23i10e25460_fig11.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (<bold>1</bold>)</p>
          <p>where the term <italic>h<sub>i</sub><sup>(l–1)</sup></italic> denotes the output from the <italic>i</italic>-th node in the (<italic>l</italic>−1)-th layer, <italic>b<sub>j</sub><sup>(l)</sup></italic> is the bias of index <italic>j</italic>, and <italic>W<sub>ij</sub><sup>l</sup></italic> is the weight between hidden unit <italic>j</italic> and <italic>i</italic>. σ<italic>(∙)</italic> is the activation function [<xref ref-type="bibr" rid="ref30">30</xref>], which is the logistic function described in equation 2:</p>
          <p><graphic xlink:href="jmir_v23i10e25460_fig12.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (<bold>2</bold>)</p>
          <p>Next, the trained DNN model is used in the NC function. The output of the last layer is converted into the probability by the softmax function [<xref ref-type="bibr" rid="ref31">31</xref>] to obtain the normalized probability-based output. The back propagation algorithm [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>] is then applied to parameter set θ in equation 3, where <italic>L</italic>(∙) is the loss function, <italic>N<sub>i</sub></italic> denotes the correct noise class, and <inline-graphic xlink:href="jmir_v23i10e25460_fig13.png" xlink:type="simple" mimetype="image"/> is the output class of the DNN-based NC.</p>
          <p><graphic xlink:href="jmir_v23i10e25460_fig15.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (<bold>3</bold>)</p>
          <p>To avoid substantial variance in the DNN output, we use the confidence measurement [<xref ref-type="bibr" rid="ref34">34</xref>] to analyze the output of the DNN-based NC. Based on the confidence measurement score, a threshold is used to determine the classification results. In other words, when the confidence measurement score is higher than the threshold, the result predicted by the NC model is considered trustworthy. Nevertheless, if the confidence measurement score is not concrete to one noise type, then the NI-DDAE is chosen for NR; on the other hand, if the confidence measurement is solid, the ND-DDAE is selected.</p>
        </sec>
        <sec>
          <title>DDAE-based NR Model</title>
          <p>In the training phase, the noisy LPS feature <bold><italic>Y<sub>j</sub><sup>LPS</sup></italic></bold> and clean LPS feature <bold><italic>X<sub>j</sub><sup>LPS</sup></italic></bold> are the input and output, respectively, of the DDAE–based NR model. The details for training the DDAE NR model with <italic>L</italic> hidden layers mapping <bold><italic>Y<sub>j</sub><sup>LPS</sup></italic></bold> to <bold><italic>X<sub>j</sub><sup>LPS</sup></italic></bold> are available elsewhere [<xref ref-type="bibr" rid="ref21">21</xref>]. The difference between NC+DDAE and NC+DDAE_T is that only the parameters of a specific layer (ie, <italic>w<sup>L-r</sup></italic> and <italic>b<sup>L-r</sup></italic>) are trainable as shown in equation 4, whereas the other parameters remain untrainable in the fine-tuning process. The constant <italic>L</italic> denotes the number of layers, and we used 5 layers (ie, <italic>L</italic>=5) in this study.</p>
          <p><graphic xlink:href="jmir_v23i10e25460_fig16.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (<bold>4</bold>)</p>
          <p>where {<italic>W</italic><sup>1</sup>…<italic>W</italic><sup>(L-r)</sup>…<italic>W<sup>L</sup></italic>} and {<italic>b</italic><sup>1</sup>… <italic>b</italic><sup>(</sup><italic><sup>L</sup></italic><sup>-r)</sup>… <italic>b<sup>L</sup></italic>} are the matrices of weights and bias vectors of the DDAE NR model, respectively, whereas <italic>Relu</italic> represents the activation function rectified linear unit [<xref ref-type="bibr" rid="ref35">35</xref>]. The constant <italic>r</italic> is the index to identify the specific trainable layer. In this study, the second layer (ie, <italic>r</italic>=3) was chosen because, on average, substituting the second layer achieved the best performance in our pilot study. The detailed experimental results are shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
          <p>Based on the above idea, the original NI-DDAE, trained with a huge database of noise samples, can be transformed into many ND-DDAE models according to the type of background noise. In this study, 12 common types of background noise were used; hence, 12 ND-DDAE models were derived from the NI-DDAE model. More specifically, each ND-DDAE model was determined by optimizing the following objective function:</p>
          <p><graphic xlink:href="jmir_v23i10e25460_fig17.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (<bold>5</bold>)</p>
          <p><graphic xlink:href="jmir_v23i10e25460_fig18.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/> (<bold>6</bold>)</p>
          <p>where <italic>M</italic> is the total number of training samples and F(<inline-graphic xlink:href="jmir_v23i10e25460_fig19.png" xlink:type="simple" mimetype="image"/>) is the loss function derived from <inline-graphic xlink:href="jmir_v23i10e25460_fig9.png" xlink:type="simple" mimetype="image"/> and <bold><italic>X<sub>j</sub><sup>LPS</sup><sub>.</sub></italic></bold> <inline-graphic xlink:href="jmir_v23i10e25460_fig9.png" xlink:type="simple" mimetype="image"/> is the vector that contains the logarithmic amplitudes of the enhanced speech corresponding to the paired noisy LPS feature <bold><italic>Y<sub>j</sub><sup>LPS</sup></italic></bold>. Subsequently, the trained NI-DDAE provides the initial parameters for the ND-DDAE model, and the noise data of the specific environment are used to fine-tune this ND-DDAE model. Finally, the transformed LPS feature <inline-graphic xlink:href="jmir_v23i10e25460_fig9.png" xlink:type="simple" mimetype="image"/> is sent to the waveform recovery unit to reconstruct the waveform. More specifically, <inline-graphic xlink:href="jmir_v23i10e25460_fig9.png" xlink:type="simple" mimetype="image"/> is first processed using square root and exponential operations. The waveform recovery function then reconstructs the enhanced speech <inline-graphic xlink:href="jmir_v23i10e25460_fig9.png" xlink:type="simple" mimetype="image"/> with the noisy phase <bold><italic>Y<sup>phase</sup></italic></bold>.</p>
        </sec>
      </sec>
      <sec>
        <title>Training and Evaluation Procedure</title>
        <p>In this section, we show how the NC, DDAE, and NC+DDAE_T models were trained. First, we trained a new NC model according to the 12 common background noises, 2talker_unseen1, 2talker_unseen2, Construction Jackhammer (CJ), 2 Talker, Cafeteria, MRT (Mass Rapid Transit), cafeteria, Toy-Squeeze-Several, speech shape noise from the Institute of Electrical and Electronics Engineers (SSN_IEEE), Siren, Multiple type noise 1, and Multiple type noise 2, which are shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>. Note that the training approach is described in the previous section “NC Model”. After the training, the prediction accuracy of the 12 noises was 100%. The detailed results of the confusion matrix are shown in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
        <p>To train the DDAE NR model, the Taiwan Mandarin version of the hearing in noise test (TMHINT) corpus [<xref ref-type="bibr" rid="ref36">36</xref>] was selected to conduct all experiments, including the training and evaluation parts. All 320 sentences, each consisting of 10 characters, were recorded at a 16 kHz sampling rate, after which 120 utterances among the TMHINT corpus were selected and corrupted by 120 noise types [<xref ref-type="bibr" rid="ref15">15</xref>] at 8 SNR levels (−10, −7, −4, −1, 1, 4, 7, and 10 dB) as the training set for the DDAE model. The other 200 utterances were also corrupted with the 12 common background noises—as mentioned in the description of NC training—at 5 SNR levels (-6, -3, 0, 3, and 6 dB) as the outside testing set. In our previous study, this trained model was defined as the NI-DDAE.</p>
        <p>Next, we combined the NC with NI-DDAE and fine-tuned the model with each noise type in the NC, and the NI-DDAE was transformed into NC+DDAE_T. In the fine-tuning step, we could freeze or adopt each layer in the NI-DDAE. Previously, we had studied which layer of the NI-DDAE model had to be replaced to achieve the best performance. We substituted each layer by modifying <italic>r</italic> in the range from 1 to 5; meanwhile, we conducted 2 well-known objective speech evaluations, PESQ [<xref ref-type="bibr" rid="ref27">27</xref>] and STOI [<xref ref-type="bibr" rid="ref28">28</xref>], to identify the most appropriate layer. On average, replacing the middle layer of the NI-DDAE model (ie, the second layer in this study) achieved a better performance than did substituting other layers. The detailed results can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Hence, we uniformly replaced the parameters of the second layer in all subsequent tests. As the 2 DL-based NR systems, DDAE and NC+DDAE, achieved better performances in our previous studies [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref21">21</xref>] than did the well-known unsupervised NR algorithms, the log minimum mean squared error [<xref ref-type="bibr" rid="ref3">3</xref>] and Karhunen-Loéve transform [<xref ref-type="bibr" rid="ref37">37</xref>], we used the DDAE and NC+DDAE algorithms for comparisons to evaluate the NC+DDAE_T in this study.</p>
        <p>Subsequently, we enrolled 10 CI users to conduct speech intelligibility tests, and details of these subjects are shown in the <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. This study protocol was approved by the Research Ethics Review Committee of Cheng Hsin Hospital under the following approval number: CHGH-IRB (645) 107A-17-2. The first author, LPHL, explained the study to the patients and collected the signed institutional review board informed consent before the experiment. All participants used their own clinical speech processors and temporarily disabled the built-in NR functions during the test. The test signals of noisy and enhanced speech were played at 65 dB sound pressure level by a speaker and were then processed through a CI processor to simulate the performance of each NR approach for CI users. To ensure that fatigue did not affect the study participants, each individual only heard a total of 16 test conditions (2 background noises [2 Talker and CJ] × 2 SNR levels [0 and 3 dB] × 4 signal processing systems [noisy, DDAE, NC+DDAE, and NC+DDAE_T]) with 10 sentences of 10 words in each test condition. The participants were instructed to repeat verbally what they had heard. We evaluated the speech intelligibility under each test condition using the word correct rate (WCR) [<xref ref-type="bibr" rid="ref38">38</xref>-<xref ref-type="bibr" rid="ref42">42</xref>] calculated as the ratio between the number of correctly identified words and the total number of words. To further prevent participant fatigue, tests were paused for 5 minutes every 30 minutes. Moreover, we calculated the statistical power to see whether the sample size (10 patients in this study) was large enough to obtain a significant difference in the result. The statistical power of this study is 1. 
According to Cohen et al [<xref ref-type="bibr" rid="ref43">43</xref>], a statistical power over 0.8 is sufficiently high to conclude that there is a significant difference in the hypothesis.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Spectrograms of the 12 noise signals: (a) 2T_BG_1, (b) 2T_BG_2, (c) CJ, (d) 2T_BB, (e) Cafeteria, (f) MRT, (g) House Fan, (h) Toy-Squeeze-Several, (i) SSN_IEEE, (j) Siren, (k) Multiple type noise 1, and (l) Multiple type noise 2. 2T_BG_1 is a noise that mixes the speech of a girl and a boy both speaking repeatedly in English. The speakers in 2T_BG_2 are the same as those in 2T_BG_1 but with different sentences. 2T_BB is a noise that overlays 2 sentences in Chinese spoken by the same male speaker. Multiple type noise 1 is a mix of the sound of sirens and cheering crowd, whereas Multiple type noise 2 is a sound combining scratching and booing. The other samples are common background noises from daily life. 2T_BB: 2 Talker; 2T_BG_1: 2 talker_unseen1; 2T_BG_2: 2 talker_unseen2; CJ: Construction Jackhammer; MRT: Mass Rapid Transit; SSN_IEEE: speech shape noise from the Institute of Electrical and Electronics Engineers.</p>
          </caption>
          <graphic xlink:href="jmir_v23i10e25460_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Objective Evaluation Using PESQ and STOI Scores</title>
        <p>We compared the newly proposed NC+DDAE_T with the previously established NR systems, DDAE and NC+DDAE. The PESQ and STOI scores of these tests are shown in <xref rid="figure4" ref-type="fig">Figures 4</xref> and <xref rid="figure5" ref-type="fig">5</xref>, respectively. As demonstrated in <xref rid="figure4" ref-type="fig">Figure 4</xref>, the PESQ scores of the proposed NC+DDAE_T are generally similar to those of the NC+DDAE. The details regarding the average scores of each approach (ie, noisy, DDAE, NC+DDAE, and NC+DDAE_T) for the 12 background noises at 6 different SNR levels can be found in Table A1 of <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>. In the STOI scores, the NC+DDAE_T model also achieved the same level as did the NC+DDAE (<xref rid="figure5" ref-type="fig">Figure 5</xref>). The detailed STOI scores are listed in Table A2 of <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>. These objective evaluation results proved that the NC+DDAE_T could provide almost the same speech intelligibility performance as the NC+DDAE.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Mean perceptual evaluation of speech quality (PESQ) scores of the 4 noise reduction approaches. 2T_BB: 2 Talker; 2T_BG_1: 2 talker_unseen1; 2T_BG_2: 2 talker_unseen2; CJ: Construction Jackhammer; dB: decibel; DDAE: deep denoising autoencoder; NC: noise classifier; NC+DDAE_T: noise classifier + deep denoising autoencoder with knowledge transfer; MRT: Mass Rapid Transit; PESQ: perceptual evaluation of speech quality; SNR: signal-to-noise ratio; SSN_IEEE: speech shape noise from  the Institute of Electrical and Electronics Engineers.</p>
          </caption>
          <graphic xlink:href="jmir_v23i10e25460_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Mean short-time objective intelligibility (STOI) scores of the different noise reduction approaches. 2T_BB: 2 Talker; 2T_BG_1: 2 talker_unseen1; 2T_BG_2: 2 talker_unseen2; CJ: Construction Jackhammer; DDAE: deep denoising autoencoder; NC: noise classifier; NC+DDAE_T: noise classifier + deep denoising autoencoder with knowledge transfer; MRT: Mass Rapid Transit; SNR: signal-to-noise ratio; SSN_IEEE: speech shape noise from the  Institute of Electrical and Electronics Engineers; STOI: short-time objective intelligibility.</p>
          </caption>
          <graphic xlink:href="jmir_v23i10e25460_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Recognition in Listening Tests</title>
        <p><xref rid="figure6" ref-type="fig">Figure 6</xref> shows the average WCR scores of 10 individuals with CI in the 2 Talker and CJ noise conditions each at 0- and 3-dB SNR levels. The detailed results are as follows: The respective average WCR scores and standard error of the mean (SEM) for noisy, DDAE, NC+DDAE, and NC+DDAE_T with 2 Talker background noise were 4.1 (SEM 1.87), 27.8 (SEM 5.42), 38.9 (SEM 8.83), and 43.2 (SEM 9.33) at the 0-dB SNR level; and 10.3 (SEM 3.84), 27.7 (SEM 5.24), 48.2 (SEM 9.69), and 50.3 (SEM 8.98) at the 3-dB SNR level. In the CJ background noise, the respective average scores and SEMs were 19.3 (SEM 5.76), 27.7 (SEM 5.24), 42.2 (SEM 9.64), and 50.6 (SEM 10.0) at the 0-dB SNR level; and 37.1 (SEM 9.84), 38.8 (SEM 8.41), 49.3 (SEM 9.31), and 50.9 (SEM 10.13) at the 3-dB SNR level. These results demonstrated that the NC+DDAE_T provided better speech intelligibility scores than did noisy speech. Moreover, the newly developed NC+DDAE_T model achieved slightly higher intelligibility performances than did the NC+DDAE approach under most test conditions. The 1-way analysis of variance (ANOVA) [<xref ref-type="bibr" rid="ref44">44</xref>] with least significant difference post hoc comparison [<xref ref-type="bibr" rid="ref45">45</xref>] was used to analyze the results of the 4 NR systems (noisy, DDAE, NC+DDAE, and NC+DDAE_T) in the 4 test conditions. The 1-way ANOVA result confirmed that the WCR scores differed significantly among the 4 systems (<italic>F</italic>=13.256; <italic>P</italic>&#60;.001). The least significant difference post hoc comparisons (<xref ref-type="table" rid="table1">Table 1</xref>) further revealed that the noisy condition was significantly different from the other 3 systems (DDAE: <italic>P</italic>=.016; NC+DDAE: <italic>P</italic>&#60;.001; NC+DDAE_T: <italic>P</italic>&#60;.001). Meanwhile, the differences between the NC+DDAE and NC+DDAE_T models were not significant (<italic>P</italic>=.50).</p>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Mean intelligibility scores of 10 participants with cochlear implants in 4 types of simulated test conditions. 2T_BB: 2 Talker;  CJ: Construction Jackhammer; dB: decibel; DDAE: deep denoising autoencoder; NC: noise classifier; NC+DDAE_T: noise classifier + deep denoising autoencoder with knowledge transfer.</p>
          </caption>
          <graphic xlink:href="jmir_v23i10e25460_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>The mean difference, standard error, and significance of the listening test in each noise reduction system.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="450"/>
            <col width="0"/>
            <col width="440"/>
            <col width="0"/>
            <col width="80"/>
            <thead>
              <tr valign="bottom">
                <td colspan="3">Method (I) by test (J)</td>
                <td colspan="2">Mean difference (I–J) (standard error)</td>
                <td><italic>P</italic> value<sup>a</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6">
                  <bold>Noisy (I)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>DDAE<sup>b</sup> (J)</td>
                <td colspan="2">–13.18 (5.428)</td>
                <td colspan="2">
                  <italic>.016</italic>
                  <sup>c</sup>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>NC<sup>d</sup>+DDAE (J)</td>
                <td colspan="2">–26.95 (5.428)</td>
                <td colspan="2">
                  <italic>&#60;</italic>
                  <italic>.001</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>NC+DDAE_T<sup>e</sup> (J)</td>
                <td colspan="2">–30.60 (5.428)</td>
                <td colspan="2">
                  <italic>&#60;</italic>
                  <italic>.001</italic>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>DDAE (I)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Noisy (J)</td>
                <td colspan="2">13.18 (5.428)</td>
                <td colspan="2">
                  <italic>.016</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>NC+DDAE (J)</td>
                <td colspan="2">–13.78 (5.428)</td>
                <td colspan="2">
                  <italic>.01</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>NC+DDAE_T (J)</td>
                <td colspan="2">–17.43 (5.428)</td>
                <td colspan="2">
                  <italic>.002</italic>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>NC+DDAE (I)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Noisy (J)</td>
                <td colspan="2">26.95 (5.428)</td>
                <td colspan="2">
                  <italic>&#60;</italic>
                  <italic>.001</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>DDAE (J)</td>
                <td colspan="2">13.78 (5.428)</td>
                <td colspan="2">
                  <italic>.01</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>NC+DDAE_T (J)</td>
                <td colspan="2">–3.65 (5.428)</td>
                <td colspan="2">.50</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>NC+DDAE_T (I)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Noisy (J)</td>
                <td colspan="2">30.60 (5.428)</td>
                <td colspan="2">
                  <italic>&#60;</italic>
                  <italic>.001</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>DDAE (J)</td>
                <td colspan="2">17.43 (5.428)</td>
                <td colspan="2">
                  <italic>.</italic>
                  <italic>002</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>NC+DDAE (J)</td>
                <td colspan="2">3.65 (5.428)</td>
                <td colspan="2">.50</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup><italic>P</italic> values are significant at α = .05. Least significant difference was selected to conduct post hoc testing.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>DDAE: deep denoising autoencoder.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>Values in italics represent significant values.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>NC: noise classifier.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>NC+DDAE_T: noise classifier + deep denoising autoencoder with knowledge transfer.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Comparison of the Numbers of Parameters</title>
        <p>The original structure of the NC+DDAE system used 12 ND+DDAEs and 1 NI+DDAE for the NR. In this study, the newly developed NC+DDAE_T system only needed 1 NI+DDAE and 12 different layer parameters to achieve the same performance as the previous NC+DDAE system. We further compared the numbers of parameters between the NC+DDAE and NC+DDAE_T approaches. The NC+DDAE_T approach required only approximately 1.0 million parameters while the previous NC+DDAE system needed 4.4 million parameters. The number of parameters was thus reduced by 76.5% compared to the previous approach.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Layers for Substitution</title>
        <p>This study proposed a new NC+DDAE_T NR model that helps CI users to improve speech intelligibility in noisy listening conditions. Knowledge transfer technology was used to reduce the parameter requirements in comparison to the previous NC+DDAE approach. The experimental results of the objective evaluation and the subjective listening tests demonstrated that the NC+DDAE_T achieved performances comparable to those of the NC+DDAE approach, while the number of parameters used by the NC+DDAE_T was reduced by 76.5% compared to the NC+DDAE. Therefore, knowledge transfer technology could be a useful approach to further improve the benefits of NC+DDAE in reducing the cost of implementation in the future.</p>
        <p>The architecture of the NC+DDAE_T, (ie, which layer is substituted) is the basis for achieving higher performance with this novel system compared to the NC+DDAE. According to the objective evaluation by PESQ and STOI scores (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), the substitution of the middle layer can achieve better performances. To further analyze why the middle layer was so important, <italic>t</italic>-distributed stochastic neighbor embedding (<italic>t</italic>-SNE) [<xref ref-type="bibr" rid="ref46">46</xref>] was used to visualize the features that are output by each layer. The acoustic features of noisy and clean speech (ie, LPS) were the inputs for the trained NI-DDAE NR model. The output features of each NI-DDAE layer were analyzed using <italic>t</italic>-SNE, which can project the distribution of each layer onto a 2D plane. <xref rid="figure7" ref-type="fig">Figure 7</xref> shows the results of this feature visualization. Green dots represent the output features of clean speech, whereas blue dots indicate features of noisy speech. The less overlap there is between the green and blue areas, the better the layer can separate the features. These results indicate that clean and noisy data were primarily separated in the output from h<sup>(2)</sup> and h<sup>(3)</sup>, implying that the front layers help to distinguish noisy speech from clean features and thus could be the most important layers. This interpretation is also consistent with the objective evaluation results in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>To explain the phenomenon illustrated in <xref rid="figure7" ref-type="fig">Figure 7</xref>, we suggest that the NC+DDAE_T model may work similarly to the human brain. The first layers of the model may try to separate the noise from the speech features. Therefore, these features would diverge completely in the middle layers of this NR model. The model would then try to reconstruct the enhanced speech and lower the volume of the noise in the final layers of the model; hence, the features would converge again in the <italic>t</italic>-SNE analysis. Based on these hypotheses, the second layer may be the key to feature separation because the features are well separated after the second layer. Therefore, to adapt the NR model to a specific type of noise, substituting the second layer would be the best choice, which corresponds to the results of the objective evaluation. The other parts of the NC+DDAE_T model may work as preprocessing and vocoder units. These parts are common units of all NR models; thus, different ND-DDAEs can share the same weight and bias values. Therefore, the concept of knowledge transfer can be used in this part to decrease the size of each model.</p>
        <fig id="figure7" position="float">
          <label>Figure 7</label>
          <caption>
            <p><italic>t</italic>-distributed stochastic neighbor embedding (<italic>t</italic>-SNE) feature analysis of each layer in the noise-independent deep denoising autoencoder (NI-DDAE) model with noisy and clean speech data. The green dots represent the output features of clean speech and the blue dots indicate features of noisy speech. 2T_BB: 2 Talker; CJ: Construction Jackhammer.</p>
          </caption>
          <graphic xlink:href="jmir_v23i10e25460_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Future Perspectives</title>
        <p>Based on previous and current results of objective evaluation and listening tests, we can conclude that the proposed NC+DDAE_T performs comparably to the NC+DDAE. In addition, the NC+DDAE_T needs only a quarter of the number of parameters compared to the 12 ND-DDAE models. These characteristics suggest a great potential for future implementation of the NC+DDAE_T model. With the decreased number of parameters, an implemented device would require less memory. To prove this concept, we have implemented the NC+DDAE_T architecture in an app on an iPhone XR mobile phone (Apple Inc) as shown in <xref rid="figure8" ref-type="fig">Figure 8</xref>. The processing time could satisfy the maximum group delay requirement of assistive listening devices. With this advantage of edge computing, the proposed NC+DDAE_T may become a new kind of hearing assistive technology in the near future.</p>
        <fig id="figure8" position="float">
          <label>Figure 8</label>
          <caption>
            <p>Schematic of the noise classifier deep denoising autoencoder with knowledge transfer (NC+DDAE_T) implementation.</p>
          </caption>
          <graphic xlink:href="jmir_v23i10e25460_fig8.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>The proposed NC+DDAE_T is an adaptable NR system, which means that the system benefits may be affected by the training data (eg, background noise types, speakers). Therefore, if the proposed system faces noisy conditions that are very different from the training data (ie, mismatch conditions), the proposed system would require major improvements, and new recordings of noise data may be needed. Overcoming this issue requires future study. Additionally, although the proposed system was implemented in an app, the full implementation of the proposed system in the hardware of currently used CI devices is still a way off. However, as studies increasingly focus on the acceleration of DL-based models in microprocessors [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>], there is a greater chance that DL technologies may be implemented into CI devices in the near future.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study proposed a novel NC+DDAE_T system for NR in CI devices. The knowledge transfer approach was used to lower the number of parameters of the DDAE model. The experimental results of the objective evaluations, along with the listening tests, showed that the proposed NC+DDAE_T model provided comparable performance to the previously established NC+DDAE NR model. These results suggest that the proposed NC+DDAE_T model may be a new NR system that can enable CI users to hear well in noisy conditions.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Results Following Replacement of Each Layer of Weight and Bias of the Deep Denoising Autoencoder Model.</p>
        <media xlink:href="jmir_v23i10e25460_app1.docx" xlink:title="DOCX File , 126 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Confusion matrix of the 12 noise classifications.</p>
        <media xlink:href="jmir_v23i10e25460_app2.docx" xlink:title="DOCX File , 46 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Individual biographical data of the attended cochlear implant subjects.</p>
        <media xlink:href="jmir_v23i10e25460_app3.docx" xlink:title="DOCX File , 17 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Perceptual evaluation of speech quality and short-time objective intelligibility scores of different noise reduction systems.</p>
        <media xlink:href="jmir_v23i10e25460_app4.docx" xlink:title="DOCX File , 29 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ANOVA</term>
          <def>
            <p>analysis of variance</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CI</term>
          <def>
            <p>cochlear implant</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CJ</term>
          <def>
            <p>Construction Jackhammer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">DDAE</term>
          <def>
            <p>deep denoising autoencoder</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">DL</term>
          <def>
            <p>deep learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">DNN</term>
          <def>
            <p>deep neural network</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">LPS</term>
          <def>
            <p>log power spectra</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">MRT</term>
          <def>
            <p>Mass Rapid Transit</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">NC</term>
          <def>
            <p>noise classifier</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">NC+DDAE_T</term>
          <def>
            <p>noise classifier + deep denoising autoencoder with knowledge transfer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">ND</term>
          <def>
            <p>noise-dependent</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">NI</term>
          <def>
            <p>noise-independent</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">NR</term>
          <def>
            <p>noise reduction</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">PESQ</term>
          <def>
            <p>perceptual evaluation of speech quality</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb15">SEM</term>
          <def>
            <p>standard error of the mean</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb16">SNR</term>
          <def>
            <p>signal-to-noise ratio</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb17">SSN_IEEE</term>
          <def>
            <p>speech shape noise from the Institute of Electrical and Electronics Engineers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb18">STOI</term>
          <def>
            <p>short-time objective intelligibility</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb19">TMHINT</term>
          <def>
            <p>Taiwan Mandarin version of the hearing in noise test</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb20"><italic>t</italic>-SNE</term>
          <def>
            <p><italic>t</italic>-distributed stochastic neighbor embedding</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb21">WCR</term>
          <def>
            <p>word correct rate</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This study was supported by the Ministry of Science and Technology of Taiwan (project #110-2218-E-A49A-501, #110-2314-B-350-003, #109-2218-E-010-004, and #108-2314-B-350 -002-MY2) and Cheng Hsin General Hospital (#CY10933).</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gifford</surname>
              <given-names>RH</given-names>
            </name>
            <name name-style="western">
              <surname>Shallop</surname>
              <given-names>JK</given-names>
            </name>
            <name name-style="western">
              <surname>Peterson</surname>
              <given-names>AM</given-names>
            </name>
          </person-group>
          <article-title>Speech recognition materials and ceiling effects: considerations for cochlear implant programs</article-title>
          <source>Audiol Neurootol</source>
          <year>2008</year>
          <volume>13</volume>
          <issue>3</issue>
          <fpage>193</fpage>
          <lpage>205</lpage>
          <pub-id pub-id-type="doi">10.1159/000113510</pub-id>
          <pub-id pub-id-type="medline">18212519</pub-id>
          <pub-id pub-id-type="pii">000113510</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sladen</surname>
              <given-names>DP</given-names>
            </name>
            <name name-style="western">
              <surname>Ricketts</surname>
              <given-names>TA</given-names>
            </name>
          </person-group>
          <article-title>Frequency importance functions in quiet and noise for adults with cochlear implants</article-title>
          <source>Am J Audiol</source>
          <year>2015</year>
          <month>12</month>
          <volume>24</volume>
          <issue>4</issue>
          <fpage>477</fpage>
          <lpage>86</lpage>
          <pub-id pub-id-type="doi">10.1044/2015_AJA-15-0023</pub-id>
          <pub-id pub-id-type="medline">26650018</pub-id>
          <pub-id pub-id-type="pii">2405181</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ephraim</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Malah</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Speech enhancement using a minimum mean-square error log-spectral amplitude estimator</article-title>
          <source>IEEE Trans. Acoust., Speech, Signal Process</source>
          <year>1985</year>
          <month>04</month>
          <volume>33</volume>
          <issue>2</issue>
          <fpage>443</fpage>
          <lpage>445</lpage>
          <pub-id pub-id-type="doi">10.1109/tassp.1985.1164550</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rezayee</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gazor</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>An adaptive KLT approach for speech enhancement</article-title>
          <source>IEEE Trans. Speech Audio Process</source>
          <year>2001</year>
          <volume>9</volume>
          <issue>2</issue>
          <fpage>87</fpage>
          <lpage>95</lpage>
          <pub-id pub-id-type="doi">10.1109/89.902276</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Scalart</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Speech enhancement based on a priori signal to noise estimation</article-title>
          <year>1996</year>
          <conf-name>IEEE International Conference on Acoustics, Speech, and Signal Processing (ICASSP), Vol. 2</conf-name>
          <conf-date>May 5, 1996</conf-date>
          <conf-loc>Atlanta, GA, USA</conf-loc>
          <fpage>629</fpage>
          <lpage>632</lpage>
          <pub-id pub-id-type="doi">10.1109/icassp.1996.543199</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>YH</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>YC</given-names>
            </name>
            <name name-style="western">
              <surname>Tsao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Young</surname>
              <given-names>ST</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of generalized maximum a posteriori spectral amplitude (GMAPA) speech enhancement algorithm in hearing aids</article-title>
          <year>2013</year>
          <conf-name>2013 IEEE International Symposium on Consumer Electronics (ISCE)</conf-name>
          <conf-date>June 2013</conf-date>
          <conf-loc>Hsinchu, Taiwan</conf-loc>
          <fpage>245</fpage>
          <lpage>246</lpage>
          <pub-id pub-id-type="doi">10.1109/isce.2013.6570208</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dawson</surname>
              <given-names>PW</given-names>
            </name>
            <name name-style="western">
              <surname>Mauger</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hersbach</surname>
              <given-names>AA</given-names>
            </name>
          </person-group>
          <article-title>Clinical evaluation of signal-to-noise ratio–based noise reduction in Nucleus cochlear implant recipients</article-title>
          <source>Ear and hearing</source>
          <year>2011</year>
          <volume>32</volume>
          <issue>3</issue>
          <fpage>382</fpage>
          <pub-id pub-id-type="doi">10.1097/aud.0b013e318201c200</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Loizou</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>Lobo</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Subspace algorithms for noise reduction in cochlear implants</article-title>
          <source>The Journal of the Acoustical Society of America</source>
          <year>2005</year>
          <month>11</month>
          <volume>118</volume>
          <issue>5</issue>
          <fpage>2791</fpage>
          <lpage>2793</lpage>
          <pub-id pub-id-type="doi">10.1121/1.2065847</pub-id>
          <pub-id pub-id-type="medline">16334894</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mauger</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Dawson</surname>
              <given-names>PW</given-names>
            </name>
            <name name-style="western">
              <surname>Hersbach</surname>
              <given-names>AA</given-names>
            </name>
          </person-group>
          <article-title>Perceptually optimized gain function for cochlear implant signal-to-noise ratio based noise reduction</article-title>
          <source>The Journal of the Acoustical Society of America</source>
          <year>2012</year>
          <month>01</month>
          <volume>131</volume>
          <issue>1</issue>
          <fpage>327</fpage>
          <lpage>336</lpage>
          <pub-id pub-id-type="doi">10.1121/1.3665990</pub-id>
          <pub-id pub-id-type="medline">22280595</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Speech enhancement based on teacher-student deep learning using improved speech presence probability for noise-robust speech recognition</article-title>
          <source>IEEE/ACM Trans. Audio Speech Lang. Process</source>
          <year>2019</year>
          <month>12</month>
          <volume>27</volume>
          <issue>12</issue>
          <fpage>2080</fpage>
          <lpage>2091</lpage>
          <pub-id pub-id-type="doi">10.1109/taslp.2019.2940662</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Supervised speech separation based on deep learning: an overview</article-title>
          <source>IEEE/ACM Trans Audio Speech Lang Process</source>
          <year>2018</year>
          <month>10</month>
          <volume>26</volume>
          <issue>10</issue>
          <fpage>1702</fpage>
          <lpage>1726</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://europepmc.org/abstract/MED/31223631"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/TASLP.2018.2842159</pub-id>
          <pub-id pub-id-type="medline">31223631</pub-id>
          <pub-id pub-id-type="pmcid">PMC6586438</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>A one-microphone algorithm for reverberant speech enhancement</article-title>
          <year>2003</year>
          <conf-name>IEEE International Conference on Acoustics, Speech, and Signal Processing</conf-name>
          <conf-date>May 2003</conf-date>
          <conf-loc>Hong Kong, China</conf-loc>
          <pub-id pub-id-type="doi">10.1109/icassp.2003.1198925</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Healy</surname>
              <given-names>EW</given-names>
            </name>
            <name name-style="western">
              <surname>Delfarah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Vasko</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Carter</surname>
              <given-names>BL</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>An algorithm to increase intelligibility for hearing-impaired listeners in the presence of a competing talker</article-title>
          <source>The Journal of the Acoustical Society of America</source>
          <year>2017</year>
          <month>06</month>
          <volume>141</volume>
          <issue>6</issue>
          <fpage>4230</fpage>
          <lpage>4239</lpage>
          <pub-id pub-id-type="doi">10.1121/1.4984271</pub-id>
          <pub-id pub-id-type="medline">28618817</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Florencio</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Speech enhancement in multiple-noise conditions using deep neural networks</article-title>
          <source>arXiv:1605.02427</source>
          <year>2016</year>
          <access-date>2016-05-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1605.02427">https://arxiv.org/abs/1605.02427</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A regression approach to speech enhancement based on deep neural networks</article-title>
          <source>IEEE/ACM Trans. Audio Speech Lang. Process</source>
          <year>2015</year>
          <month>1</month>
          <volume>23</volume>
          <issue>1</issue>
          <fpage>7</fpage>
          <lpage>19</lpage>
          <pub-id pub-id-type="doi">10.1109/taslp.2014.2364452</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Tsao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Matsuda</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hori</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Speech enhancement based on deep denoising autoencoder</article-title>
          <source>Interspeech</source>
          <year>2013</year>
          <month>08</month>
          <conf-name>Interspeech 2013</conf-name>
          <conf-date>25-29 August 2013</conf-date>
          <conf-loc>Lyon, France</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>LR</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>CH</given-names>
            </name>
          </person-group>
          <article-title>Dynamic noise aware training for speech enhancement based on deep neural networks</article-title>
          <year>2014</year>
          <conf-name>Fifteenth Annual Conference of the International Speech Communication Association</conf-name>
          <conf-date>Sept 14, 2014</conf-date>
          <conf-loc>Singapore</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Tsao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>A deep denoising autoencoder approach to improving the intelligibility of vocoded speech in cochlear implant simulation</article-title>
          <source>IEEE Trans. Biomed. Eng</source>
          <year>2017</year>
          <month>7</month>
          <volume>64</volume>
          <issue>7</issue>
          <fpage>1568</fpage>
          <lpage>1578</lpage>
          <pub-id pub-id-type="doi">10.1109/tbme.2016.2613960</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goehring</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Bolner</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Monaghan</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>van Dijk</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zarowski</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bleeck</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Speech enhancement based on neural networks improves speech intelligibility in noise for cochlear implant users</article-title>
          <source>Hearing Research</source>
          <year>2017</year>
          <month>02</month>
          <volume>344</volume>
          <fpage>183</fpage>
          <lpage>194</lpage>
          <pub-id pub-id-type="doi">10.1016/j.heares.2016.11.012</pub-id>
          <pub-id pub-id-type="medline">27913315</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goehring</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Keshavarzi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Carlyon</surname>
              <given-names>RP</given-names>
            </name>
            <name name-style="western">
              <surname>Moore</surname>
              <given-names>BCJ</given-names>
            </name>
          </person-group>
          <article-title>Using recurrent neural networks to improve the perception of speech in non-stationary noise by people with cochlear implants</article-title>
          <source>The Journal of the Acoustical Society of America</source>
          <year>2019</year>
          <month>07</month>
          <volume>146</volume>
          <issue>1</issue>
          <fpage>705</fpage>
          <lpage>718</lpage>
          <pub-id pub-id-type="doi">10.1121/1.5119226</pub-id>
          <pub-id pub-id-type="medline">31370586</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>Y. H.</given-names>
            </name>
            <name name-style="western">
              <surname>Tsao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>Y. T.</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>K. C.</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y. H.</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>P. H.</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>C. H</given-names>
            </name>
          </person-group>
          <article-title>Deep learning–based noise reduction approach to improve speech intelligibility for cochlear implant recipients</article-title>
          <source>Ear and hearing</source>
          <year>2018</year>
          <volume>39</volume>
          <issue>4</issue>
          <fpage>795</fpage>
          <lpage>809</lpage>
          <pub-id pub-id-type="doi">10.1097/aud.0000000000000537</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>A survey on transfer learning</article-title>
          <source>IEEE Trans. Knowl. Data Eng</source>
          <year>2010</year>
          <month>10</month>
          <volume>22</volume>
          <issue>10</issue>
          <fpage>1345</fpage>
          <lpage>1359</lpage>
          <pub-id pub-id-type="doi">10.1109/tkde.2009.191</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Latif</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rana</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Younis</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Qadir</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Epps</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Transfer learning for improving speech emotion classification accuracy</article-title>
          <source>arXiv</source>
          <year>2018</year>
          <access-date>2018-01-19</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1801.06353">https://arxiv.org/abs/1801.06353</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Soong</surname>
              <given-names>FK</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Multi-speaker modeling and speaker adaptation for DNN-based TTS synthesis</article-title>
          <year>2015</year>
          <conf-name>International Conference on Acoustics, Speech and Signal Processing</conf-name>
          <conf-date>April 2015</conf-date>
          <conf-loc>South Brisbane, QLD, Australia</conf-loc>
          <fpage>4475</fpage>
          <lpage>4479</lpage>
          <pub-id pub-id-type="doi">10.1109/icassp.2015.7178817</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Weiss</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Transfer learning from speaker verification to multispeaker text-to-speech synthesis</article-title>
          <source>Advances in neural information processing systems</source>
          <year>2018</year>
          <fpage>4480</fpage>
          <lpage>4490</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1806.04558"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Siniscalchi</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>CH</given-names>
            </name>
          </person-group>
          <article-title>Transfer learning and progressive stacking approach to reducing deep model sizes with an application to speech enhancement</article-title>
          <year>2017</year>
          <conf-name>International Conference on Acoustics, Speech and Signal Processing</conf-name>
          <conf-date>March 2017</conf-date>
          <conf-loc>New Orleans, LA, USA</conf-loc>
          <fpage>5575</fpage>
          <lpage>5579</lpage>
          <pub-id pub-id-type="doi">10.1109/icassp.2017.7953223</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rix</surname>
              <given-names>AW</given-names>
            </name>
            <name name-style="western">
              <surname>Beerends</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Hollier</surname>
              <given-names>MP</given-names>
            </name>
            <name name-style="western">
              <surname>Hekstra</surname>
              <given-names>AP</given-names>
            </name>
          </person-group>
          <article-title>Perceptual evaluation of speech quality (PESQ)-a new method for speech quality assessment of telephone networks and codecs</article-title>
          <year>2001</year>
          <month>05</month>
          <conf-name>International Conference on Acoustics, Speech, and Signal Processing</conf-name>
          <conf-date>May 7-11, 2001</conf-date>
          <conf-loc>Salt Lake City, UT, USA</conf-loc>
          <fpage>749</fpage>
          <lpage>752</lpage>
          <pub-id pub-id-type="doi">10.1109/icassp.2001.941023</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Taal</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Hendriks</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Heusdens</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jensen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A short-time objective intelligibility measure for time-frequency weighted noisy speech</article-title>
          <year>2010</year>
          <conf-name>International Conference on Acoustics, Speech and Signal Processing</conf-name>
          <conf-date>April 2010</conf-date>
          <conf-loc>Dallas, TX, USA</conf-loc>
          <fpage>4214</fpage>
          <lpage>4217</lpage>
          <pub-id pub-id-type="doi">10.1109/icassp.2010.5495701</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Du</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Huo</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <article-title>A speech enhancement approach using piecewise linear approximation of an explicit model of environmental distortions</article-title>
          <year>2008</year>
          <conf-name>Ninth Annual Conference of the International Speech Communication Association</conf-name>
          <conf-date>Sept 2008</conf-date>
          <conf-loc>Brisbane, Australia</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Davis</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mermelstein</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Comparison of parametric representations for monosyllabic word recognition in continuously spoken sentences</article-title>
          <source>IEEE Trans. Acoust., Speech, Signal Process</source>
          <year>1980</year>
          <month>08</month>
          <volume>28</volume>
          <issue>4</issue>
          <fpage>357</fpage>
          <lpage>366</lpage>
          <pub-id pub-id-type="doi">10.1109/tassp.1980.1163420</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Vinyals</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Distilling the knowledge in a neural network</article-title>
          <source>arXiv</source>
          <year>2015</year>
          <access-date>2015-03-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1503.02531">https://arxiv.org/abs/1503.02531</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Learning deep architectures for AI</article-title>
          <source>FNT in Machine Learning</source>
          <year>2009</year>
          <publisher-loc>Norwell, MA</publisher-loc>
          <publisher-name>Now Publishers Inc</publisher-name>
          <fpage>1</fpage>
          <lpage>127</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mohamed</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dahl</surname>
              <given-names>GE</given-names>
            </name>
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Acoustic modeling using deep belief networks</article-title>
          <source>IEEE Trans. Audio Speech Lang. Process</source>
          <year>2012</year>
          <month>01</month>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>14</fpage>
          <lpage>22</lpage>
          <pub-id pub-id-type="doi">10.1109/tasl.2011.2109382</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mengusoglu</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Ris</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Use of acoustic prior information for confidence measure in ASR applications</article-title>
          <year>2001</year>
          <conf-name>Seventh European Conference on Speech Communication and Technology</conf-name>
          <conf-date>Sept 3-7, 2001</conf-date>
          <conf-loc>Aalborg, Denmark</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nair</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>GE</given-names>
            </name>
          </person-group>
          <article-title>Rectified linear units improve restricted Boltzmann machines</article-title>
          <source>ICML</source>
          <year>2010</year>
          <month>01</month>
          <access-date>2019-07-17</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openreview.net/forum?id=rkb15iZdZB">https://openreview.net/forum?id=rkb15iZdZB</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>LLN</given-names>
            </name>
            <name name-style="western">
              <surname>Soli</surname>
              <given-names>SD</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>MW</given-names>
            </name>
          </person-group>
          <article-title>Development of the Mandarin Hearing in Noise Test (MHINT)</article-title>
          <source>Ear Hear</source>
          <year>2007</year>
          <month>04</month>
          <volume>28</volume>
          <issue>2 Suppl</issue>
          <fpage>70S</fpage>
          <lpage>74S</lpage>
          <pub-id pub-id-type="doi">10.1097/AUD.0b013e31803154d0</pub-id>
          <pub-id pub-id-type="medline">17496652</pub-id>
          <pub-id pub-id-type="pii">00003446-200704001-00018</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mittal</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Phamdo</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Signal/noise KLT based approach for enhancing speech degraded by colored noise</article-title>
          <source>IEEE Trans. Speech Audio Process</source>
          <year>2000</year>
          <volume>8</volume>
          <issue>2</issue>
          <fpage>159</fpage>
          <lpage>167</lpage>
          <pub-id pub-id-type="doi">10.1109/89.824700</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Loizou</surname>
              <given-names>PC</given-names>
            </name>
          </person-group>
          <article-title>Predicting the intelligibility of vocoded and wideband Mandarin Chinese</article-title>
          <source>The Journal of the Acoustical Society of America</source>
          <year>2011</year>
          <month>05</month>
          <volume>129</volume>
          <issue>5</issue>
          <fpage>3281</fpage>
          <lpage>3290</lpage>
          <pub-id pub-id-type="doi">10.1121/1.3570957</pub-id>
          <pub-id pub-id-type="medline">21568429</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>LLN</given-names>
            </name>
            <name name-style="western">
              <surname>Qiu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Azimi</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>The contribution of matched envelope dynamic range to the binaural benefits in simulated bilateral electric hearing</article-title>
          <source>J Speech Lang Hear Res</source>
          <year>2013</year>
          <month>08</month>
          <volume>56</volume>
          <issue>4</issue>
          <fpage>1166</fpage>
          <lpage>1174</lpage>
          <pub-id pub-id-type="doi">10.1044/1092-4388(2012/12-0255)</pub-id>
          <pub-id pub-id-type="medline">23926330</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yuan</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of noise reduction methods for sentence recognition by Mandarin-speaking cochlear implant listeners</article-title>
          <source>Ear Hear</source>
          <year>2015</year>
          <month>01</month>
          <volume>36</volume>
          <issue>1</issue>
          <fpage>61</fpage>
          <lpage>71</lpage>
          <pub-id pub-id-type="doi">10.1097/AUD.0000000000000074</pub-id>
          <pub-id pub-id-type="medline">25127321</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tsao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Effects of adaptation rate and noise suppression on the intelligibility of compressed-envelope based speech</article-title>
          <source>PLoS ONE</source>
          <year>2015</year>
          <month>7</month>
          <day>21</day>
          <volume>10</volume>
          <issue>7</issue>
          <fpage>e0133519</fpage>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0133519</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Tsao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>HLS</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>YH</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>PH</given-names>
            </name>
          </person-group>
          <article-title>A deep learning based noise reduction approach to improve speech intelligibility for cochlear implant recipients in the presence of competing speech noise</article-title>
          <year>2017</year>
          <conf-name>Asia-Pacific Signal and Information Processing Association Annual Summit and Conference (APSIPA ASC)</conf-name>
          <conf-date>Dec 2017</conf-date>
          <conf-loc>Kuala Lumpur, Malaysia</conf-loc>
          <fpage>808</fpage>
          <lpage>812</lpage>
          <pub-id pub-id-type="doi">10.1109/APSIPA.2017.8282144</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Statistical power analysis for the behavioural sciences</article-title>
          <source>Hillsdale, NJ: Lawrence Erlbaum Associates</source>
          <year>1988</year>
          <publisher-loc>Hillsdale, NJ</publisher-loc>
          <publisher-name>Lawrence Erlbaum Associates</publisher-name>
          <fpage>273</fpage>
          <lpage>406</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dien</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Issues in the application of the average reference: Review, critiques, and recommendations</article-title>
          <source>Behavior Research Methods, Instruments, &#38; Computers</source>
          <year>1998</year>
          <month>3</month>
          <volume>30</volume>
          <issue>1</issue>
          <fpage>34</fpage>
          <lpage>43</lpage>
          <pub-id pub-id-type="doi">10.3758/bf03209414</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Abdi</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Fisher’s least significant difference (LSD) test</article-title>
          <source>Encyclopedia of research design</source>
          <year>2010</year>
          <fpage>840</fpage>
          <lpage>853</lpage>
          <pub-id pub-id-type="doi">10.1007/978-1-4020-6754-9_9279</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van der Maaten</surname>
              <given-names>LJP</given-names>
            </name>
            <name name-style="western">
              <surname>Hinton</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Visualizing data using t-SNE</article-title>
          <source>Journal of Machine Learning Research</source>
          <year>2008</year>
          <fpage>2579</fpage>
          <lpage>2605</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Georgiev</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lane</surname>
              <given-names>ND</given-names>
            </name>
            <name name-style="western">
              <surname>Mascolo</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chu</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Accelerating mobile audio sensing algorithms through on-chip gpu offloading</article-title>
          <year>2017</year>
          <conf-name>Proceedings of the 15th Annual International Conference on Mobile Systems, Applications, and Services</conf-name>
          <conf-date>June 2017</conf-date>
          <conf-loc>New York, NY, USA</conf-loc>
          <fpage>306</fpage>
          <lpage>318</lpage>
          <pub-id pub-id-type="doi">10.1145/3081333.3081358</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ran</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Deep learning with edge computing: a review</article-title>
          <source>Proc. IEEE</source>
          <year>2019</year>
          <month>8</month>
          <volume>107</volume>
          <issue>8</issue>
          <fpage>1655</fpage>
          <lpage>1674</lpage>
          <pub-id pub-id-type="doi">10.1109/jproc.2019.2921977</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
