<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e83790</article-id><article-id pub-id-type="doi">10.2196/83790</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Explainable and Interpretable AI for Voice and Speech Analysis in Clinical Care: Systematic Review</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Ebraheem</surname><given-names>Mohamed</given-names></name><degrees>MSCS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Toghranegar</surname><given-names>Jamie</given-names></name><degrees>SLPD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><collab>Bridge2AI-Voice Consortium</collab><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Bensoussan</surname><given-names>Yael</given-names></name><degrees>MD</degrees><xref ref-type="aff" 
rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Templeton</surname><given-names>John Michael</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Bellini College of Artificial Intelligence, Cybersecurity and Computing, University of South Florida</institution><addr-line>4202 East Fowler Avenue</addr-line><addr-line>Tampa</addr-line><addr-line>FL</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Otolaryngology Head and Neck Surgery, USF Health Voice Center, University of South Florida</institution><addr-line>Tampa</addr-line><addr-line>FL</addr-line><country>United States</country></aff><aff id="aff3"><institution>Medical Engineering, College of Engineering, University of South Florida</institution><addr-line>Tampa</addr-line><addr-line>FL</addr-line><country>United States</country></aff><aff id="aff6">Chao Amanda, Ma Linda, Rajkumar Gayathiri, Jenkins Kathy, Jo Stacy, Silberholz Elizabeth, Costello John, Diaz-Ocampo Enrique, Zeng Xijie, Rudzicz Frank, Moothedan Elijah, Brito Rahul, Ghaffar Omar, Siu Jennifer, Levinsky Justin, Russell Laurie, Samuel Joyce, Su Lala, Bevers Isaac, Jenney Kaley, Wilke Jordan, Ghosh Satrajit, Tu Julie, Zanin Madeleine, Casalino Selina, Mahajan Radhika, Anibal James, Dorr David, Bedrick Steven, Dalal Abhijeet, Hersh William, Michaels LeAnn, Talluri Venkata Swarna Mukhi, Goldenberg Anna, Miao Siyu, B&#x00E9;lisle-Pipon Jean-Christophe, Amraei Dona, Bernier Alexander, Blatter Alden, Cadillac L&#x00E9;o, Doherty-Kirby Amanda, English Renee, Gallois Hortense, Gaelyn Garrett C, Khawaja Zoha, Loewith Chloe, Malo Marie-Fran&#x00E7;oise, Varela Pablo Montoya, Pnacekova Michaela, Potter Jaiden, Premi-Bortolotto Claire, 
Taylor Luka, Victor Gavin, Wilson Claire, Jayachandran Lochana, Lapadula Elisa, Ravitsky Vardit, Ng Evan, Ghavanini Amer, Syed Toufeeq Ahmed, Awan Shaheen, Bolser Donald, Bensoussan Yael, Bahr Ruth, Watts Stephanie, Boyer Micah, Abdel-Aty Yassmeen, Armosh Kirollos, Martinez Ana Sophia Avila, Beltran Helena, Berrios Moroni, Brown John, Santiago Iris De, Ebraheem Mohamed, Eiseman Ellie, Elmahdy Mahmoud, Evangelista Emily, Hanna Karim, Jain Jennifer, Guardela Brenda Juan, Kalia Ayush, Kalia Megha, Kostelnik Cynthia, Krause Alisa, Leo Genelle, Maharaj Vrishni, Mikael Marian, Nafii Yosef, Neal Tempestt, Newberry Karlee, Nickel Christopher, Pharr Trevor, Rafatjou Parnaz, Rahman JM, Rossi Jillian, Stark John, Sudhakar Shrramana Ganesh, Toghranegar Jamie, Urbano Megan, Zesiewicz Theresa, Lerner-Ellis Jordan, Gelbard Alexander, Powell Maria, Brown Amy, Fletcher Kenneth, Kobayashi Kenji, Peltier Amanda, Pontell Matthew, Rohde Sarah, Riesthal Michael de, Cruz Samantha Salvi, Vinson Kimberly, Krussel Andrea, Payne Phillip, Sigaras Alexandros, Rameau Ana&#x00EF;s, Elemento Olivier, Ramos John, Tang Jeffrey, Zhao Robin, Zisimopoulos Pantelis</aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Amanna</surname><given-names>Adaobi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Pradhan</surname><given-names>Sojen</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Xuancong</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Mohamed Ebraheem, MSCS, Bellini College of Artificial Intelligence, Cybersecurity and Computing, University of South Florida, 4202 East Fowler Avenue, Tampa, FL, 33620, United States, 1 8135850780; 
<email>mohamedusama@usf.edu</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>24</day><month>6</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e83790</elocation-id><history><date date-type="received"><day>09</day><month>09</month><year>2025</year></date><date date-type="rev-recd"><day>17</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>20</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Mohamed Ebraheem, Jamie Toghranegar, Bridge2AI-Voice Consortium, Yael Bensoussan, John Michael Templeton. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 24.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e83790"/><abstract><sec><title>Background</title><p>Driven by recent advances in artificial intelligence (AI), particularly in medicine, audio-based voice and speech biomarkers are increasingly investigated for various medical applications as a complementary or even alternative modality to traditional medical devices. The adoption of deep learning techniques in recent literature is motivated by their superior performance compared to classical machine learning methods. However, ethical and regulatory concerns regarding the black-box nature of these models have limited their integration into clinical workflows. Consequently, explainable artificial intelligence (XAI) has recently been used to address this issue by generating explanations for opaque model outputs. Ideally, medical XAI systems aim to provide human-understandable, clinically grounded explanations essential for enhanced AI trustworthiness and, thereby, facilitate adoption into real-world clinical settings.</p></sec><sec><title>Objective</title><p>We conduct a systematic literature review of XAI methods applied for explaining deep learning techniques in audio-based voice and speech clinical applications. We aim to identify what XAI methods have been used to explain the decisions of deep learning voice and speech AI systems in health care, as well as XAI-informed insights. Additionally, we aim to contextualize these findings with respect to clinical applicability and stakeholder relevance. 
Lastly, we identify opportunities and recommendations for future clinical audio XAI design.</p></sec><sec sec-type="methods"><title>Methods</title><p>We followed the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guidelines. Six electronic databases (IEEE Xplore, ACM Digital Library, Scopus, PubMed, Web of Science, and Nature) were searched for papers published between January 2015 and February 2025. Eligible studies applied explainability or interpretability methods to deep learning models for voice or speech audio in health care contexts. Risk of bias was assessed using PROBAST+AI (Prediction Model Risk of Bias Assessment Tool). The results were thematically synthesized across explainability categories, input representations, clinical domains, validation strategies, and stakeholder considerations.</p></sec><sec sec-type="results"><title>Results</title><p>A total of 30 studies met the inclusion criteria. These studies used a range of explainability approaches, including gradient-based methods, perturbation-based techniques, surrogate model&#x2013;based methods, model-internal representation analyses, concept-based detectors, and attention-based explanations. Applications spanned diverse clinical domains, including voice disorders, neurodegenerative diseases, psychiatric conditions, and traumatic brain injury. Overall, results indicate that most studies relied primarily on qualitative interpretation of explainability outputs, with limited quantitative validation of explanation consistency across external datasets. Furthermore, none of the included studies explicitly conducted human-in-the-loop evaluations with relevant stakeholders, highlighting a substantial gap in stakeholder alignment.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Current XAI practices in clinical voice and speech analysis are limited by insufficient validation, lack of domain-specific design, and misalignment with clinical stakeholder needs. 
This review highlights opportunities for developing validated, audio-aware, and stakeholder-centered XAI approaches to support trustworthy clinical deployment. Interpretation of these findings should consider limitations related to single-reviewer study selection, a potentially high risk of bias, and the repeated use of benchmark datasets.</p></sec></abstract><kwd-group><kwd>explainable artificial intelligence</kwd><kwd>clinical voice analysis</kwd><kwd>speech biomarkers</kwd><kwd>deep learning</kwd><kwd>interpretability</kwd><kwd>medical decision support</kwd><kwd>trustworthy AI</kwd><kwd>artificial intelligence</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Overview</title><p>Voice is a rich modality that has garnered the interest of the research community for its potential in numerous medical applications [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Voice and speech biomarkers have recently been used for voice pathology detection [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>], voice quality assessment [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>], neurodegenerative disease diagnosis [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref13">13</xref>], mental health monitoring [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>], cardio-respiratory condition classification [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>], as well as automatic speech recognition (ASR) for disordered speech [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>]. 
The main reason behind this interest in audio-based voice and speech biomarkers is the costliness and invasiveness of traditional voice evaluation techniques, such as laryngoscopy, stroboscopy, laryngeal electromyography, and imaging technologies such as MRI (magnetic resonance imaging) and CT (computed tomography) scans, which limit accessibility for many. Alternatively, artificial intelligence (AI)&#x2013;driven audio-based medical systems pave the way for broader access to medical services for marginalized and underprivileged populations [<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>Yet, AI integration into real clinical settings is limited, in part due to the scarcity of high-quality data needed to train reliable and fair models [<xref ref-type="bibr" rid="ref22">22</xref>]. Consequently, myriad large-scale projects are underway for the purpose of collecting extensive and representative voice audio datasets. National Institutes of Health&#x2013;funded Bridge2AI is a multi-institution, large-scale project that aims to collect standardized, AI-ready, and ethically sourced voice data across various health conditions, where voice and speech samples have been collected from 442 participants [<xref ref-type="bibr" rid="ref23">23</xref>]. Similarly, AphasiaBank (part of TalkBank Project, Carnegie Mellon University) is another National Institutes of Health&#x2013;funded endeavor that has amassed multimodal data from 306 persons with aphasia [<xref ref-type="bibr" rid="ref24">24</xref>]. Launched in 2023 and funded through 2028, SpeechDx is a global initiative dedicated to creating an extensive dataset of Alzheimer&#x2019;s speech biomarkers and has recruited about 2000 participants [<xref ref-type="bibr" rid="ref25">25</xref>]. 
These efforts demonstrate the general recognition of the potential of voice and speech AI in routine clinical practice.</p><p>Recently, the trustworthiness of AI systems has been a central issue for clinical integration of AI, particularly the obscurity of the decision-making processes of deep learning models to end users [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. Known as the &#x201C;black-box&#x201D; problem, this issue is especially critical in medicine, where decisions directly impact patient safety. Yet, regulatory frameworks have struggled to keep pace with AI&#x2019;s rapid development, leaving unresolved questions of liability and accountability [<xref ref-type="bibr" rid="ref30">30</xref>]. Clinicians and regulators must understand how models operate, how reliable they are, and under what conditions they fail, before AI systems are integrated into clinical workflows in a meaningful way [<xref ref-type="bibr" rid="ref31">31</xref>]. Only then can patients be guaranteed safe, high-quality medical care. On one end, there are opinions against the use of black-box models at all for high-stakes environments such as medicine [<xref ref-type="bibr" rid="ref32">32</xref>]. While white-box, classical machine learning (ML) models such as decision trees or support vector machines trained on interpretable, hand-crafted features offer greater transparency, deep learning models consistently achieve superior performance; this is known as the interpretability-accuracy trade-off [<xref ref-type="bibr" rid="ref33">33</xref>].</p><p>Explainable artificial intelligence (XAI) has emerged to ameliorate this challenge, aiming to make black-box models more transparent [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. 
However, many XAI techniques have been developed for image-based domains, such as overlaying saliency maps on brain MRI or chest CT scans. In voice analysis, raw audio is often transformed into time-frequency representations such as spectrograms before being input into deep learning models. Mapping a region on a spectrogram back to an intuitively understandable auditory event (eg, a tremor on a specific syllable) is inherently more complex than identifying a visible tumor on an x-ray. The relationship between spectral features and perceived voice quality or pathology is often highly nonlinear, making clinical interpretation challenging.</p><p>Furthermore, the intuitiveness of model explanations varies substantially with respect to end users [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>]. Computer scientists and ML researchers may be able to interpret technical visualizations such as activation maps or feature attribution plots, whereas clinicians, patients, and regulators have different knowledge bases, priorities, and constraints. A &#x201C;one-size-fits-all&#x201D; approach to XAI design is therefore inappropriate, especially when explainability is positioned as a pathway to increasing trust. Therefore, a multidisciplinary effort in the design of XAI methods is vital for providing appropriate presentations of explanations suited for the diverse backgrounds and needs of the respective stakeholders [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>].</p><p>In this context, this paper presents a systematic literature review of XAI methods applied to deep learning models for clinical voice and speech analysis. 
In this review, we use &#x201C;clinical audio&#x201D; and &#x201C;clinical voice and speech&#x201D; to refer to voice or speech data derived from individuals with a clinical condition used for a health care&#x2013;relevant application or outcome, regardless of whether the recording occurred in a controlled clinical environment. The review addresses the following research questions:</p><list list-type="bullet"><list-item><p>What XAI methods have been used to explain the decisions of deep learning voice and speech AI systems in health care?</p></list-item><list-item><p>What insights are derived from the application of these XAI methods?</p></list-item><list-item><p>What are the limitations of these XAI methods in the context of clinical audio in terms of clinical applicability and stakeholder relevance?</p></list-item></list><p>The remainder of this paper is organized as follows. The rest of the Introduction section presents background for XAI concepts and different perspectives regarding the definitions of explainability and interpretability. We also outline the objectives of XAI and provide a discussion on the broad application of XAI in diverse medical domains. The Methods section describes the systematic review methodology. The Results section presents explainability approaches, explainability validation strategies, and human-centered evaluation. Then, we discuss the limitations of current approaches, stakeholder alignment, and the impact of audio representation on interpretability, upon which future directions are presented.</p></sec><sec id="s1-2"><title>Background</title><sec id="s1-2-1"><title>Explainability vs Interpretability</title><p>There is no clear consensus in the literature on the definitions of explainability and interpretability in the context of AI, and the two terms are often used interchangeably. 
Nonetheless, various works have attempted to provide distinctions between them.</p><p>Linardatos et al [<xref ref-type="bibr" rid="ref39">39</xref>] highlight the persistent ambiguity surrounding these concepts and review the different ways they have been differentiated in literature. They describe interpretability as relating to the intuition behind a model&#x2019;s outputs, such that a more interpretable model makes it easier to identify causal relationships between inputs and outputs. On the other hand, explainability is concerned with the internal logic and mechanics of the model. They conclude that interpretability is the broader term, and that a model can be interpretable without being explainable.</p><p>Gilpin et al [<xref ref-type="bibr" rid="ref40">40</xref>] approach the distinction differently, defining explainability as a means of answering the questions &#x201C;why&#x201D; or &#x201C;why not&#x201D; a system behaves in a particular way. In contrast, they describe interpretability as the ability to represent the model&#x2019;s internal processes in a human-understandable form, emphasizing that this is dependent on the knowledge and needs of the target user.</p><p>Das and Rad [<xref ref-type="bibr" rid="ref41">41</xref>] define interpretability as a quality of a system in which its expressions convey human-understandable insights into how it works. They differentiate explanations as additional metadata&#x2014;produced either by the model or an external algorithm&#x2014;that clarify the relationship between inputs and outputs.</p><p>The National Institute of Standards and Technology also provides guidance on explainability, outlining four principles for explainable systems, the most important of which is that they should deliver accompanying justifications or reasons for model outputs [<xref ref-type="bibr" rid="ref42">42</xref>]. 
The work further distinguishes between self-interpretable models, which are inherently understandable to humans, and post hoc explanations, which are generated by an explainability algorithm to provide insight into otherwise opaque models.</p><p>In summary, interpretability relates to the inherent transparency of the model itself; white-box models, such as decision trees and linear regression, are generally considered interpretable, while explainability refers to the generation of additional information (ie, explanations) that clarifies the reasoning behind a model&#x2019;s decision, regardless of the model&#x2019;s inherent transparency. In this work, we adopt the latter definition.</p></sec><sec id="s1-2-2"><title>Objectives of Explainable Artificial Intelligence</title><sec id="s1-2-2-1"><title>Overview</title><p>XAI serves multiple partially overlapping objectives, which vary according to the target application and the perspective of the stakeholder [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. The following categories are directly derived from other studies [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>].</p></sec><sec id="s1-2-2-2"><title>Debugging and Monitoring</title><p>Explanations help AI developers identify model errors, biases, and spurious correlations, thereby revealing opportunities for performance improvement. XAI can also be used to monitor performance drift during deployment, ensuring the model continues to operate as intended over time.</p></sec><sec id="s1-2-2-3"><title>Evaluation and Validation</title><p>XAI enables stakeholders to assess whether a model is appropriate, reliable, and clinically valid for a given application. 
It supports both predeployment evaluation and ongoing validation, ensuring that model decisions remain aligned with intended clinical outcomes.</p></sec><sec id="s1-2-2-4"><title>Justification and Transparency</title><p>XAI fosters trust among stakeholders by providing context-relevant rationales for model decisions. This includes justifying individual predictions and improving transparency in the decision-making process, allowing clinicians, patients, and regulators to audit and verify the system&#x2019;s outputs.</p></sec><sec id="s1-2-2-5"><title>Improvement and Learning</title><p>Explanations support iterative model refinement through collaboration with domain experts, enhancing alignment with clinical reasoning. XAI can also contribute to the discovery of new domain knowledge, such as identifying previously unknown biomarkers.</p></sec><sec id="s1-2-2-6"><title>Governance and Compliance</title><p>In high-stakes domains such as medicine, XAI facilitates compliance with ethical, legal, and regulatory requirements, including provisions such as the European General Data Protection Regulation &#x201C;right to explanation&#x201D; [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. It enables model auditability, supports liability attribution, and strengthens governance processes.</p></sec></sec></sec><sec id="s1-3"><title>Characteristics of XAI Methods</title><sec id="s1-3-1"><title>Overview</title><p>Traditionally, XAI methods are categorized according to their scope of application (ie, model-specific and model-agnostic). The classical taxonomy groups XAI methods into intrinsic methods, where the model contains native interpretable or explainable components, and post hoc techniques, where a secondary system generates explanations. The resultant explanations either provide local insight into individual sample instances or justify model behavior globally. 
Accordingly, methods in the literature are categorized in the Results section (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Summary of XAI<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> methods reported in the included studies, classified according to model dependency (model-agnostic vs model-specific), scope of explanation (local vs global), relationship to the predictive model (post hoc vs intrinsic), and explanation modality. The table shows ablation analysis and latent space interpretation dominating the literature, along with Grad-CAM<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">XAI methods</td><td align="left" valign="bottom">Model agnostic/specific</td><td align="left" valign="bottom">Global/local</td><td align="left" valign="bottom">Post hoc/intrinsic</td><td align="left" valign="bottom">Explanation modality</td><td align="left" valign="bottom">Implemented in</td></tr></thead><tbody><tr><td align="left" valign="top">Grad-CAM</td><td align="left" valign="top">Specific</td><td align="left" valign="top">Local</td><td align="left" valign="top">Post hoc</td><td align="left" valign="top">Visual</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref47">47</xref>-<xref ref-type="bibr" rid="ref49">49</xref>]</td></tr><tr><td align="left" valign="top">Guided backpropagation</td><td align="left" valign="top">Specific</td><td align="left" valign="top">Local</td><td align="left" valign="top">Post hoc</td><td align="left" valign="top">Visual</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref50">50</xref>]</td></tr><tr><td align="left" valign="top">Saliency maps</td><td align="left" 
valign="top">Specific</td><td align="left" valign="top">Local</td><td align="left" valign="top">Post hoc</td><td align="left" valign="top">Visual</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref4">4</xref>]</td></tr><tr><td align="left" valign="top">Eigen-CAM<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">Specific</td><td align="left" valign="top">Local</td><td align="left" valign="top">Post hoc</td><td align="left" valign="top">Visual</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref10">10</xref>]</td></tr><tr><td align="left" valign="top">SHAP<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top">Agnostic</td><td align="left" valign="top">Local</td><td align="left" valign="top">Post hoc</td><td align="left" valign="top">Tabular</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref52">52</xref>]</td></tr><tr><td align="left" valign="top">GradientSHAP<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup></td><td align="left" valign="top">Specific</td><td align="left" valign="top">Local</td><td align="left" valign="top">Post hoc</td><td align="left" valign="top">Tabular/visual</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref53">53</xref>]</td></tr><tr><td align="left" valign="top">LIME<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup></td><td align="left" valign="top">Agnostic</td><td align="left" valign="top">Local</td><td align="left" valign="top">Post hoc</td><td align="left" valign="top">Tabular/visual</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref9">9</xref>]</td></tr><tr><td align="left" valign="top">xDMFCC<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup></td><td align="left" valign="top">Agnostic</td><td align="left" valign="top">Local</td><td align="left" valign="top">Post 
hoc</td><td align="left" valign="top">Visual/tabular</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref54">54</xref>]</td></tr><tr><td align="left" valign="top">Ablation studies</td><td align="left" valign="top">Specific</td><td align="left" valign="top">Global</td><td align="left" valign="top">Post hoc</td><td align="left" valign="top">Tabular</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref56">56</xref>]</td></tr><tr><td align="left" valign="top">Simple attention</td><td align="left" valign="top">Specific</td><td align="left" valign="top">Local</td><td align="left" valign="top">Intrinsic</td><td align="left" valign="top">Visual</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref52">52</xref>]</td></tr><tr><td align="left" valign="top">Attention rollout</td><td align="left" valign="top">Specific</td><td align="left" valign="top">Local</td><td align="left" valign="top">Post hoc</td><td align="left" valign="top">Visual</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref57">57</xref>]</td></tr><tr><td align="left" valign="top">Concept detectors network</td><td align="left" valign="top">Specific</td><td align="left" valign="top">Global</td><td align="left" valign="top">Intrinsic</td><td align="left" valign="top">Concept-level</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref58">58</xref>]</td></tr><tr><td align="left" valign="top">Sinc filters</td><td align="left" valign="top">Specific</td><td align="left" valign="top">Global</td><td align="left" valign="top">Intrinsic</td><td align="left" valign="top">Conceptual (filter shape)</td><td align="left" 
valign="top">[<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref53">53</xref>]</td></tr><tr><td align="left" valign="top">Feature map analysis</td><td align="left" valign="top">Specific</td><td align="left" valign="top">Global</td><td align="left" valign="top">Post hoc</td><td align="left" valign="top">Visual (CNN<sup><xref ref-type="table-fn" rid="table1fn8">h</xref></sup> filters)</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]</td></tr><tr><td align="left" valign="top">t-SNE<sup><xref ref-type="table-fn" rid="table1fn9">i</xref></sup></td><td align="left" valign="top">Agnostic</td><td align="left" valign="top">Global</td><td align="left" valign="top">Post hoc</td><td align="left" valign="top">Visual</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref59">59</xref>,<xref ref-type="bibr" rid="ref60">60</xref>]</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>XAI: explainable artificial intelligence.</p></fn><fn id="table1fn2"><p><sup>b</sup>Grad-CAM: gradient-weighted class activation mapping.</p></fn><fn id="table1fn3"><p><sup>c</sup>Eigen-CAM: Eigen class activation mapping.</p></fn><fn id="table1fn4"><p><sup>d</sup>SHAP: Shapley Additive Explanations.</p></fn><fn id="table1fn5"><p><sup>e</sup>GradientSHAP: gradient Shapley Additive Explanations.</p></fn><fn id="table1fn6"><p><sup>f</sup>LIME: local interpretable model-agnostic explanations.</p></fn><fn id="table1fn7"><p><sup>g</sup>xDMFCC: explainable deep learning mel-frequency cepstral coefficients.</p></fn><fn id="table1fn8"><p><sup>h</sup>CNN: convolutional neural 
network.</p></fn><fn id="table1fn9"><p><sup>i</sup>t-SNE: t-distributed stochastic neighbor embedding.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s1-3-2"><title>Model-Specific vs Model-Agnostic</title><p>Model-specific methods exploit structural aspects particular to model type or architecture to, in theory, produce higher-quality explanations. For example, Grad-CAM (gradient-weighted class activation mapping) is applicable to convolutional neural networks (CNNs) and takes advantage of their spatial feature maps to highlight input features that are most influential for the output.</p><p>In contrast, model-agnostic methods, such as SHAP (Shapley Additive Explanations), are generalizable to any ML model regardless of architecture. This universality enables versatile implementation across many applications. Unlike model-specific techniques that benefit from internal model architecture, model-agnostic methods often exhibit lower fidelity and efficiency, especially when dealing with high-dimensional data.</p></sec><sec id="s1-3-3"><title>Local vs Global</title><p>Local explainability methods aim to clarify the reasoning behind a model&#x2019;s prediction for a single input instance. These explanations are instance-specific and do not describe the model&#x2019;s overall decision-making process. For example, visualization techniques such as Class Activation Maps can generate heatmaps that indicate the regions of an input spectrogram most relevant to the model&#x2019;s prediction. Local explanations are particularly valuable in clinical decision support scenarios where individual case justification is critical.</p><p>Global explainability methods, on the other hand, aim to describe the model&#x2019;s general behavior across the entire dataset. This can involve identifying the most influential features for distinguishing between classes, mapping decision boundaries, or summarizing feature interactions. 
Such methods are crucial for understanding systematic model biases, validating clinical relevance, and ensuring that the model&#x2019;s logic aligns with domain knowledge.</p></sec><sec id="s1-3-4"><title>Intrinsic vs Post Hoc</title><p>As mentioned earlier, intrinsic (or self-) interpretability refers to the degree to which a model&#x2019;s decision-making process is transparent and human-understandable by design. White-box models are examples of intrinsically interpretable models. However, they often underperform compared to more complex black-box architectures such as deep neural networks.</p><p>Post hoc methods, in contrast, are applied after the training process is completed to explain the model&#x2019;s behavior without altering its internal mechanics. These techniques, ranging from saliency maps to perturbation-based analyses, are particularly prevalent for explaining black-box models.</p></sec></sec><sec id="s1-4"><title>XAI in Medicine</title><p>With recent advances in medical AI, the implementation of XAI has become increasingly critical to ensure that AI models are reliable, ethical, legally compliant, and clinically aligned, particularly in high-stakes environments such as health care. The literature contains numerous surveys discussing the role of XAI across diverse medical applications.</p><p>Several recent reviews have broadly examined the adoption of XAI in medicine, emphasizing its potential to improve transparency, foster trust, and facilitate regulatory compliance [<xref ref-type="bibr" rid="ref61">61</xref>-<xref ref-type="bibr" rid="ref65">65</xref>]. These works cover a variety of data modalities, including medical imaging, electronic health records (EHR), genomics, and time-series analysis. Commonly reported methods include model-agnostic techniques, most notably LIME (local interpretable model-agnostic explanations) and SHAP, alongside post hoc visualization approaches. 
Nonetheless, these reviews consistently highlight persistent challenges relating to faithfulness, stability, and standardized evaluation of explanations.</p><p>Domain-specific surveys further illustrate these trends. For example, van der Velden et al [<xref ref-type="bibr" rid="ref66">66</xref>] reviewed over 200 studies applying XAI to medical imaging, noting the predominance of visualization techniques, followed by perturbation-based, textual, and example-based approaches. While visual explanations have been extensively validated, the authors stress the need for equivalent validation of textual and example-based methods. Muhammad and Bendechache [<xref ref-type="bibr" rid="ref67">67</xref>] similarly highlight the interpretive ambiguity and perturbation sensitivity of visual methods. In the context of medical time-series data, Caterson et al [<xref ref-type="bibr" rid="ref68">68</xref>] present a scoping review of XAI for EHR, finding feature attribution methods to be the most widely used. Salih et al [<xref ref-type="bibr" rid="ref69">69</xref>] review XAI in cardiology for ECG (electrocardiogram) and EHR, reporting the frequent use of SHAP and Grad-CAM, followed by LIME, but also note that nearly half (47%) of the studies did not use any formal evaluation of the explanations produced. In the mental health and psychiatry domain, Joyce et al [<xref ref-type="bibr" rid="ref70">70</xref>] reviewed XAI applied to neuroimaging, interview transcripts, and physiological data, underscoring the importance of human-centered design for producing clinically useful explanations beyond raw saliency maps. 
With respect to clinical audio, Chen et al [<xref ref-type="bibr" rid="ref16">16</xref>] reviewed XAI approaches for vocal biomarker-based lung disease detection, discussing several issues, including the utility of explanations, defined in terms of informativeness and user understanding, as an important criterion for evaluating explanations.</p><p>Despite the breadth of these surveys, to the best of our knowledge, no systematic reviews have comprehensively examined the use of XAI for clinical voice and speech biomarkers more broadly. This gap highlights the need for a dedicated synthesis in this domain, particularly given the unique challenges and interpretive demands of audio-based clinical decision support systems.</p></sec><sec id="s1-5"><title>Unique Challenges of Clinical Audio for XAI</title><sec id="s1-5-1"><title>Audio Abstraction and Representation</title><p>In the time domain, audio is represented as waveforms that fully capture the acoustic signal but are difficult for humans to visually interpret directly. Consequently, audio is often transformed into spectrograms or mel-frequency cepstral coefficients (MFCCs). While clinicians such as speech-language pathologists and audiologists are trained to interpret spectrograms, visual explanations based on time-frequency representations remain suboptimal, particularly for other stakeholder groups. This issue is further highlighted when explanations are inconsistent across samples or vary substantially between different explainability methods, undermining their reliability and interpretive stability [<xref ref-type="bibr" rid="ref71">71</xref>,<xref ref-type="bibr" rid="ref72">72</xref>]. This can be attributed to the fact that humans perceive sound aurally rather than visually [<xref ref-type="bibr" rid="ref73">73</xref>]. 
MFCCs, a compact nonlinear &#x201C;spectrum-of-the-spectrum&#x201D; representation of audio, are even more obscure to clinicians, limiting their clinical interpretability [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref74">74</xref>]. Additionally, speaker characteristics are usually spread across multiple frequency bands, making the problem of localizing relevant frequency information too broad to solve effectively with traditional vision-based XAI methods [<xref ref-type="bibr" rid="ref75">75</xref>]. Even when relevant spatiotemporal regions are identifiable through traditional vision-based methods, the reason these regions are important remains at least partially obscure and requires further analysis. Ultimately, the nonvisual perceptual nature of voice and speech makes current visual explanations a nonideal solution to the overarching problem of explainability and trustworthy clinical AI.</p></sec><sec id="s1-5-2"><title>Temporal Dynamics</title><p>Unlike medical images, where both axes carry spatial meaning, audio has an inherently temporal structure. Identifying when a clinically relevant event occurs (eg, a stutter or pause) is as important as identifying which acoustic features are involved. This mismatch makes it difficult for visualization-based methods such as saliency maps or Grad-CAM, designed for spatial data, to yield clinically actionable explanations in audio [<xref ref-type="bibr" rid="ref73">73</xref>,<xref ref-type="bibr" rid="ref75">75</xref>].</p></sec><sec id="s1-5-3"><title>Annotation Scarcity</title><p>Phonetic, prosodic, disfluency, and voice quality annotations are essential for aligning model explanations with known biomarkers; however, such granular annotations, typically performed by multiple trained experts, are resource-intensive and, consequently, scarce. 
Without such annotations, validation of model explanations&#x2019; alignment to medically grounded features becomes increasingly daunting, limiting their utility in practice.</p><p>In summary, these challenges highlight that explainability methods developed for domains such as imaging or EHR are not directly transferable to clinical audio. Audio-native evaluation frameworks and human-centered approaches are needed to ensure that XAI methods drive actionable clinical insights.</p></sec></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Systematic Review Search Strategy</title><p>We conducted our systematic review according to the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) guideline. A review protocol was prepared, but not registered before this study. Our goal was to survey papers that perform explainability or interpretability analyses of deep learning models applied to clinical voice and speech audio. A comprehensive search was carried out across IEEE Xplore, ACM, Scopus, PubMed, Web of Science, and Nature.</p><p>Search terms were selected across three domains: (1) explainability and interpretability; (2) voice, speech, and acoustics; and (3) clinical and health care context. These terms were used to create a query, which was adapted for each database. The exact search strings for each database can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The search was conducted across all databases on February 7, 2025, except ACM, which was searched on February 11, 2025. The search included studies published between January 1, 2015, and the date of the search. 
In total, 1426 records were retrieved, which were reduced to 1348 after removing 78 duplicates.</p></sec><sec id="s2-2"><title>Eligibility Criteria</title><p>Papers were selected if they (1) focused on deep learning models applied to voice and/or speech data for health care applications, (2) applied explainability or interpretability techniques to the model, (3) reported empirical results or experimental validation, and (4) were published in peer-reviewed journals or conferences.</p><p>Papers were excluded for any of the following reasons: (1) explainability and/or interpretability analysis performed for non&#x2013;deep learning models; (2) used audio transcripts only (ie, purely natural language processing methods); (3) used nonclinical audio datasets; (4) not published in English; and (5) reviews, theses, or non&#x2013;peer-reviewed papers.</p></sec><sec id="s2-3"><title>Study Selection</title><p>The results of the search strategy were exported to the RAYYAN tool [<xref ref-type="bibr" rid="ref76">76</xref>] for screening. One reviewer conducted the initial screening of titles and abstracts against the eligibility criteria, and a second reviewer independently verified the filtering decisions. Papers were excluded only if there was clear evidence in the title or abstract that the paper did not meet the criteria; otherwise, they proceeded to full-text review. The screening process resulted in 187 papers deferred for full-text review, of which 2 papers were retracted, and 2 papers were inaccessible. The final set for full-text review was 183 papers.</p><p>Full-text screening was performed by two reviewers, with deliberation and discussion used to resolve disagreements. Reasons for exclusion at this stage were documented. 
The final included set consisted of 30 papers that met all eligibility criteria.</p></sec><sec id="s2-4"><title>Data Extraction</title><p>Data extraction was carried out by a single reviewer using a standardized spreadsheet. A second reviewer reviewed the extracted data and deliberated on any uncertainties or discrepancies to ensure accuracy and completeness. The following information was extracted from the final set of included papers: (1) bibliographic information (authors and year), (2) dataset information (clinical condition, number of subjects, and acoustic tasks), (3) clinical application, (4) deep learning methodology (model, hyperparameters, and training or validation strategy), (5) model performance, (6) explainability or interpretability strategy, (7) insights gained from explainability or interpretability analysis, and (8) validation or support for the explainability or interpretability results.</p></sec><sec id="s2-5"><title>Data Synthesis</title><sec id="s2-5-1"><title>Overview</title><p>Data were synthesized across explainability method type, explainability input-output characteristics, explainability validation strategies, and stakeholder involvement.</p></sec><sec id="s2-5-2"><title>Explainability Methods</title><p>As existing surveys of XAI demonstrate a lack of consensus on a unified XAI taxonomy, we adopted a set of commonly used XAI method categories reported across prior surveys and taxonomy literature, including work in medical XAI [<xref ref-type="bibr" rid="ref77">77</xref>-<xref ref-type="bibr" rid="ref79">79</xref>]. Our work does not aim to propose a new categorization or taxonomy, as this is outside the scope of this study. 
Accordingly, explainability methods in the included studies were grouped using the following widely adopted categories:</p><list list-type="bullet"><list-item><p>Gradient saliency-based methods: these methods derive explanations by analyzing gradients of the model output with respect to the input or intermediate feature maps, characterizing the influence of different parts of the input signal on model predictions.</p></list-item><list-item><p>Perturbation-based techniques: these methods generate explanations by systematically modifying, masking, or removing parts of the input or feature space and observing the resulting change in model output.</p></list-item><list-item><p>Surrogate model-based methods: this category includes methods that use simpler models to approximate the local behavior of complex models.</p></list-item><list-item><p>Model-internal representation analysis: this category encompasses methods that provide explainability through inspection and/or visualization of a model&#x2019;s internal structures.</p></list-item><list-item><p>Concept-based methods: these methods include techniques that provide explanations by relating model behavior to predefined, higher-level, semantically human-understandable concepts or clinically meaningful attributes.</p></list-item><list-item><p>Attention-based explanations: this family of methods encompasses techniques that rely on attention weights to generate explanations.</p></list-item></list></sec><sec id="s2-5-3"><title>Explainability Input-Output Representation</title><p>For each included study, we recorded the input representation over which explainability was applied and the corresponding form of the explainability output, as these factors directly determine the interpretability and modality of explanations in clinical audio systems. 
Input representation refers to the signal or feature representation interrogated by the explainability method (which may differ from that used by the predictive model), while output characteristics refer to how explanations were presented (eg, explanation modality). These attributes were extracted to support the synthesis of explainability practices across studies and to contextualize differences in explanation form, granularity, and clinical interpretability.</p></sec><sec id="s2-5-4"><title>Explainability Validation Strategies</title><p>In this work, an explainability validation strategy is defined as any technical, model-centric procedure used to assess the faithfulness, consistency, or robustness of explainability outputs with respect to the underlying model behavior. This definition explicitly refers to model-centered validation and excludes human judgment or interpretive assessment.</p><p>The following explainability validation strategies were recorded for the included studies:</p><list list-type="bullet"><list-item><p>Perturbation-based validation: input features, time segments, or frequency regions identified as salient by the explainability method are systematically modified, masked, or removed to assess the resulting impact on model predictions or performance.</p></list-item><list-item><p>Ground-truth or annotation-based verification: salient regions or features identified by the explainability method are compared against externally defined references, such as expert annotations, labeled phonetic events, disorder-specific acoustic markers, or task-related temporal boundaries, when used to evaluate correspondence with model behavior.</p></list-item><list-item><p>Stability or cross-dataset consistency analysis: explanation patterns are examined across different datasets, cohorts, recording conditions, or evaluation splits to assess the robustness and consistency of explanations under data variation or dataset shift.</p></list-item></list></sec><sec 
id="s2-5-5"><title>Domain-Specific Explanation Patterns</title><p>To support structured synthesis of explainability findings, study-derived insights were grouped and analyzed according to the clinical application domain targeted by each study (as defined by each study&#x2019;s objective). Explainability outputs were examined within each domain to identify recurring explanation patterns. We also noted potential risk of circular validation, particularly in cases where multiple studies relied on the same dataset, which may limit the generalizability of observed explanation trends.</p></sec><sec id="s2-5-6"><title>Human-Centered Analysis and Stakeholder Alignment</title><p>Beyond technical explainability validation, human-centered evaluation is critical for assessing the explanation quality, clinical relevance, and practical utility of explainability methods in real-world health care settings. Accordingly, stakeholder alignment was assessed separately from technical validation.</p><p>Stakeholder alignment was defined as explicit involvement of domain experts, such as clinicians, speech-language pathologists, or regulatory stakeholders, in interpreting, evaluating, or providing feedback on explainability outputs. For each included study, the presence or absence of human-in-the-loop evaluation was recorded, along with the reported form of involvement.</p></sec></sec><sec id="s2-6"><title>Risk of Bias and Applicability Assessment</title><p>A formal quality and risk-of-bias assessment was conducted to evaluate the methodological robustness of prediction models estimating health-related outcomes. 
PROBAST+AI (Prediction Model Risk of Bias Assessment Tool for Artificial Intelligence) [<xref ref-type="bibr" rid="ref80">80</xref>], an extension of the original PROBAST [<xref ref-type="bibr" rid="ref81">81</xref>] framework developed to address advances in AI- and ML-based prediction models, was used for this purpose.</p><p>A review-level PICOTS (Population; Index Model; Comparator; Outcome; Timing; Setting) framework was predefined to guide the scope and intended applicability of the assessment. The tool was applied to studies that developed or evaluated prediction models estimating health-related outcomes. Studies that did not fit these criteria were not suited for quality assessment by the PROBAST+AI tool. Each eligible study was classified as model development, model evaluation, or both. In accordance with PROBAST+AI guidance, development and evaluation components were assessed separately.</p><p>The tool evaluates four domains: (1) participants and data sources: addressing data origin, collection procedures, and representativeness; (2) predictors: examining input definition, preprocessing, and availability at the time of intended use; (3) outcome: assessing outcome definition, measurement, and timing; and (4) analysis: evaluating sample size adequacy, validation strategy, handling of missing data, risk of data leakage, and mitigation of overfitting.</p><p>Domains were rated as low, high, or unclear for development (quality concern) and evaluation (risk of bias), respectively. Applicability was assessed for the first three domains only and not for the analysis domain. The fourth domain does not include applicability considerations under the PROBAST+AI framework, as applicability refers to the assessor&#x2019;s review question or intended use of a model, including the target population and setting. In accordance with PROBAST+AI guidance, if at least one domain was rated high, the overall judgment was classified as high. 
Detailed signaling criteria are described in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref47">47</xref>-<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref59">59</xref>,<xref ref-type="bibr" rid="ref60">60</xref>,<xref ref-type="bibr" rid="ref82">82</xref>].</p><p>The assessment was independently conducted by two reviewers, with disagreements resolved by discussion. The PICOTS framework definition and domain-specific ratings are provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview of Included Studies</title><p>A total of 30 studies met the inclusion and exclusion criteria. Although the search spanned publications from 2015 to 2025, all included studies were published from 2020 onward, except for a single paper. This can be explained by the relatively recent adoption of deep learning-based approaches and XAI for voice and speech in health care research. A PRISMA flow diagram summarizing the selection process is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. The results are summarized in <xref ref-type="table" rid="table2">Table 2</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>PRISMA flowchart. 
DL: deep learning; ML: machine learning; NLP: natural language processing; PRISMA: Preferred Reporting Items for Systematic Reviews and Meta-Analyses; XAI: explainable artificial intelligence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e83790_fig01.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Overview of the 30 included studies applying explainable and interpretable deep learning methods to clinical voice and speech analysis. The table summarizes clinical application domains, datasets, modeling approaches, and explainability techniques. Across studies, post hoc, local explainability methods predominated, with latent representation analysis, most commonly t-SNE<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, being the most frequently used approach. Overall, explanations were primarily interpreted qualitatively, with limited quantitative validation, external consistency assessment, or human-centered evaluation reported.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Work</td><td align="left" valign="bottom">Application</td><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Model performance</td><td align="left" valign="bottom">XAI<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> method</td><td align="left" valign="bottom">XAI insight</td><td align="left" valign="bottom">Quantitative XAI output validation</td></tr></thead><tbody><tr><td align="left" valign="top">Shaikh et al [<xref ref-type="bibr" rid="ref4">4</xref>]</td><td align="left" valign="top">Voice disorder classification</td><td align="left" valign="top">MLP<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup>, 1D-CNN<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">97.1% accuracy, 99.8% recall, 97% F1</td><td align="left" 
valign="top">Saliency maps</td><td align="left" valign="top">Top five LLD<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup> features were extracted per condition; saliency maps emphasized low amplitude, high-frequency spectrogram regions, though no consistent pattern emerged.</td><td align="left" valign="top">Cross-dataset evaluation</td></tr><tr><td align="left" valign="top">Gupta et al [<xref ref-type="bibr" rid="ref50">50</xref>]</td><td align="left" valign="top">Multiclass dysarthria severity classification</td><td align="left" valign="top">ResNet-14<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" valign="top">98.8% accuracy, 98.9% F1</td><td align="left" valign="top">Guided backpropagation</td><td align="left" valign="top">For low-severity cases, the model focused on high-energy vowel regions (clearer phoneme boundaries); for high-severity cases, activations were diffuse, consistent with phonatory instability and temporal smearing.</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">Fu et al [<xref ref-type="bibr" rid="ref48">48</xref>]</td><td align="left" valign="top">Schizophrenia vs healthy (binary)</td><td align="left" valign="top">Sch-Net (CNN<sup><xref ref-type="table-fn" rid="table2fn8">h</xref></sup> with skip connections, CBAM<sup><xref ref-type="table-fn" rid="table2fn9">i</xref></sup>)</td><td align="left" valign="top">97.68% accuracy, 99.1% recall, 97.7% F1</td><td align="left" valign="top">Grad-CAM<sup><xref ref-type="table-fn" rid="table2fn10">j</xref></sup></td><td align="left" valign="top">Reduced high-frequency energy focus (<italic>&#x003C;</italic>5 kHz) and emphasis on low-frequency formant stripes (&#x003C;2 kHz), suggesting articulation errors (voiced instead of unvoiced consonants) aligned with blunted affect.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Lee 
et al [<xref ref-type="bibr" rid="ref6">6</xref>]</td><td align="left" valign="top">Postoperative vocal recovery (GRBAS<sup><xref ref-type="table-fn" rid="table2fn11">k</xref></sup>)</td><td align="left" valign="top">EfficientNet-B4 (CNN)+ LSTM<sup><xref ref-type="table-fn" rid="table2fn12">l</xref></sup></td><td align="left" valign="top">0.379 RMSE<sup><xref ref-type="table-fn" rid="table2fn13">m</xref></sup> (regression), 91.8% AUC<sup><xref ref-type="table-fn" rid="table2fn14">n</xref></sup> (breathiness, binary)</td><td align="left" valign="top">Grad-CAM</td><td align="left" valign="top">Different activation patterns by GRBAS level: attention to low bands (0&#x2010;2 kHz, formants), mid bands (2&#x2010;4 kHz, harmonics/noise), and temporal regions (pauses/breathy segments).</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Peng et al [<xref ref-type="bibr" rid="ref3">3</xref>]</td><td align="left" valign="top">Multiclass voice disorder classification</td><td align="left" valign="top">OpenL3+ SVM<sup><xref ref-type="table-fn" rid="table2fn15">o</xref></sup></td><td align="left" valign="top">99.5% accuracy, 99.6% recall, 99.6% F1</td><td align="left" valign="top">Grad-CAM, t-SNE</td><td align="left" valign="top">Disorder-specific band focus: healthy in low-frequency regions; hyperkinetic dysphonia in both high and low bands; reflux laryngitis in high bands; hypokinetic dysphonia weak across bands.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Rojas et al [<xref ref-type="bibr" rid="ref47">47</xref>]</td><td align="left" valign="top">Mild traumatic brain injury detection</td><td align="left" valign="top">ResNet</td><td align="left" valign="top">67.4% accuracy, 76.1% recall, 69.9% F1</td><td align="left" valign="top">Grad-CAM</td><td align="left" valign="top">Model down-weighted low frequencies and emphasized high-frequency regions for mTBI<sup><xref ref-type="table-fn" 
rid="table2fn16">p</xref></sup> predictions.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Shen and Zhang [<xref ref-type="bibr" rid="ref49">49</xref>]</td><td align="left" valign="top">Speech disfluency detection</td><td align="left" valign="top">Multiadversarial neural network</td><td align="left" valign="top">58.7% UAR<sup><xref ref-type="table-fn" rid="table2fn17">q</xref></sup></td><td align="left" valign="top">Time-related Grad-CAM, t-SNE</td><td align="left" valign="top">Highlighted frames aligned with annotated disfluencies; distinct disfluency types exhibited different temporal Grad-CAM patterns; results supported wav2vec capturing meaningful temporal cues.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Jeong et al [<xref ref-type="bibr" rid="ref10">10</xref>]</td><td align="left" valign="top">Parkinson disease classification</td><td align="left" valign="top">AST<sup><xref ref-type="table-fn" rid="table2fn18">r</xref></sup>, EfficientNet-based</td><td align="left" valign="top">92.15% accuracy, 91.5% recall, 92.15% F1</td><td align="left" valign="top">Eigen-CAM<sup><xref ref-type="table-fn" rid="table2fn19">s</xref></sup></td><td align="left" valign="top">Emphasis on higher-frequency bands associated with muffled/degraded speech in PD<sup><xref ref-type="table-fn" rid="table2fn20">t</xref></sup>.</td><td align="left" valign="top">Annotation-based verification</td></tr><tr><td align="left" valign="top">Schultebraucks et al [<xref ref-type="bibr" rid="ref51">51</xref>]</td><td align="left" valign="top">PTSD<sup><xref ref-type="table-fn" rid="table2fn21">u</xref></sup>/major depressive disorder (binary and severity) at 1 month</td><td align="left" valign="top">Multimodal DNN<sup><xref ref-type="table-fn" rid="table2fn22">v</xref></sup></td><td align="left" valign="top">90% AUC, 84% recall, 83% F1 (PTSD), 86% AUC, 82% recall, 82% F1 (depression)</td><td align="left" 
valign="top">SHAP<sup><xref ref-type="table-fn" rid="table2fn23">w</xref></sup></td><td align="left" valign="top">Key predictors: reduced pitch/intensity (voice), negative affect/self-focus (language), flat affect (face).</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Ditthapron et al [<xref ref-type="bibr" rid="ref53">53</xref>]</td><td align="left" valign="top">TBI<sup><xref ref-type="table-fn" rid="table2fn24">x</xref></sup> vs healthy (binary)</td><td align="left" valign="top">pSinc+ cGRU<sup><xref ref-type="table-fn" rid="table2fn25">y</xref></sup></td><td align="left" valign="top">83.8% balanced accuracy, 92.9% recall, 85.1% F1</td><td align="left" valign="top">GradientSHAP<sup><xref ref-type="table-fn" rid="table2fn26">z</xref></sup>, Sinc filters</td><td align="left" valign="top">High attribution to filler words and high-frequency spectral patterns; formants were salient for TBI detection.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Zhang et al [<xref ref-type="bibr" rid="ref9">9</xref>]</td><td align="left" valign="top">Dementia detection</td><td align="left" valign="top">BiLSTM<sup><xref ref-type="table-fn" rid="table2fn27">aa</xref></sup>+ multihead attention (audio); DistilBERT<sup><xref ref-type="table-fn" rid="table2fn28">ab</xref></sup>+1D-CNN+ cross-modal attention</td><td align="left" valign="top">80.8% accuracy, 77.57% recall, 83.23% F1 (external validation)</td><td align="left" valign="top">SHAP (LLDs), LIME<sup><xref ref-type="table-fn" rid="table2fn29">ac</xref></sup> (text/audio)</td><td align="left" valign="top">Linguistic features (eg, noun phrase rate, word rate) were most predictive among LLDs; AD<sup><xref ref-type="table-fn" rid="table2fn30">ad</xref></sup> speech showed more fillers/pronouns/function words, lower energy, and slower rate; attention emphasized disfluent/low-energy segments.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td 
align="left" valign="top">Guti&#x00E9;rrez-Seraf&#x00ED;n et al [<xref ref-type="bibr" rid="ref54">54</xref>]</td><td align="left" valign="top">Brain lesion detection</td><td align="left" valign="top">CNN</td><td align="left" valign="top">73% accuracy, 75% recall, 75% F1</td><td align="left" valign="top">xDMFCCs<sup><xref ref-type="table-fn" rid="table2fn31">ae</xref></sup> (LIME)</td><td align="left" valign="top">MFCC<sup><xref ref-type="table-fn" rid="table2fn32">af</xref></sup>-1/2 (energy/clarity) were most important; controls showed earlier/clearer articulation with more discriminative energy in higher-order MFCCs; patients showed delayed onset, slower articulation, and longer phoneme duration.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Liu et al [<xref ref-type="bibr" rid="ref18">18</xref>]</td><td align="left" valign="top">Automatic dysarthric speech recognition</td><td align="left" valign="top">TDNN-HMM<sup><xref ref-type="table-fn" rid="table2fn33">ag</xref></sup>; CTC<sup><xref ref-type="table-fn" rid="table2fn34">ah</xref></sup>; LAS<sup><xref ref-type="table-fn" rid="table2fn35">ai</xref></sup>, encoder&#x2013;decoder</td><td align="left" valign="top">25.5% WER<sup><xref ref-type="table-fn" rid="table2fn36">aj</xref></sup></td><td align="left" valign="top">Ablation studies</td><td align="left" valign="top">TDNN<sup><xref ref-type="table-fn" rid="table2fn37">ak</xref></sup> outperformed CTC on moderate-severe dysarthria (highlighting the importance of temporal dependencies); speaker adaptation substantially reduced WER (per-speaker customization is beneficial).</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Huang et al [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" valign="top">Schizophrenia severity (classification/regression)</td><td align="left" valign="top">Transformer embeddings (BERT<sup><xref ref-type="table-fn" rid="table2fn38">al</xref></sup>, 
ELECTRA<sup><xref ref-type="table-fn" rid="table2fn39">am</xref></sup>, TERA<sup><xref ref-type="table-fn" rid="table2fn40">an</xref></sup>)+ BiLSTM + FC<sup><xref ref-type="table-fn" rid="table2fn41">ao</xref></sup></td><td align="left" valign="top">88% accuracy, 80% F1 (severity classification)</td><td align="left" valign="top">Ablation studies</td><td align="left" valign="top">BERT is most crucial for the TLC<sup><xref ref-type="table-fn" rid="table2fn42">ap</xref></sup> scale and for PANSSs<sup><xref ref-type="table-fn" rid="table2fn43">aq</xref></sup> except PANSS-General; ELECTRA contributed moderately; TERA was low for TLC but important for PANSS, especially positive symptoms and general psychopathology.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Herath et al [<xref ref-type="bibr" rid="ref82">82</xref>]</td><td align="left" valign="top">Aphasia severity classification</td><td align="left" valign="top">DNN</td><td align="left" valign="top">98.5% accuracy, 97.3% recall, 97.4% F1</td><td align="left" valign="top">Ablation studies</td><td align="left" valign="top">MFCC-DNN performed best; ZCR-DNN<sup><xref ref-type="table-fn" rid="table2fn44">ar</xref></sup> performed worst.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">He et al [<xref ref-type="bibr" rid="ref55">55</xref>]</td><td align="left" valign="top">Schizophrenia detection</td><td align="left" valign="top">WNSA-Net<sup><xref ref-type="table-fn" rid="table2fn45">as</xref></sup></td><td align="left" valign="top">98.16% accuracy, 98.72% F1 (TORGO<sup><xref ref-type="table-fn" rid="table2fn46">at</xref></sup>)</td><td align="left" valign="top">Ablation studies</td><td align="left" valign="top">Wideband and narrowband spectrograms provided complementary information; dilated convolutions captured micro-level (pitch, formants) and macro-level (rate, prosody) cues.</td><td align="left" valign="top">Cross-dataset 
evaluation</td></tr><tr><td align="left" valign="top">Lahoti et al [<xref ref-type="bibr" rid="ref11">11</xref>]</td><td align="left" valign="top">Parkinson disease detection</td><td align="left" valign="top">Multihead attention BiLSTM</td><td align="left" valign="top">85.02% accuracy, 84.9% F1</td><td align="left" valign="top">Ablation studies</td><td align="left" valign="top">Augmenting cepstral features with shifted delta cepstra improved performance over single-frequency filtering cepstral coefficients alone, highlighting long-term temporal dependencies for PD detection.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Zhang et al [<xref ref-type="bibr" rid="ref15">15</xref>]</td><td align="left" valign="top">Depression detection</td><td align="left" valign="top">Wav2vec+1D-CNN+ LSTM</td><td align="left" valign="top">90.9% accuracy, 90.7% F1, 95.6% AUC (binary)</td><td align="left" valign="top">Ablation studies</td><td align="left" valign="top">Models with self-attention performed better; wav2vec embeddings were superior; 7-second segments worked best, suggesting emotion is concentrated in short spans.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Laguarta and Subirana [<xref ref-type="bibr" rid="ref12">12</xref>]</td><td align="left" valign="top">Alzheimer disease detection</td><td align="left" valign="top">Open voice brain model (GNN<sup><xref ref-type="table-fn" rid="table2fn47">au</xref></sup>)</td><td align="left" valign="top">93.3% accuracy, 95% AUC</td><td align="left" valign="top">Ablation studies</td><td align="left" valign="top">Memory/fluency features dominated early-stage AD detection, with sentiment/prosody also contributing; AD often showed high saliency in respiratory control, disfluency, or memory-related patterns.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Joshy and Rajan [<xref ref-type="bibr" rid="ref56">56</xref>]</td><td 
align="left" valign="top">Dysarthria severity classification</td><td align="left" valign="top">DNN, CNN, GRU<sup><xref ref-type="table-fn" rid="table2fn48">av</xref></sup></td><td align="left" valign="top">93.97% accuracy (speaker dependent), 70.52% accuracy (speaker independent)</td><td align="left" valign="top">Ablation studies; t-SNE</td><td align="left" valign="top">MFCCs performed best in a speaker-dependent setup; CQCCs<sup><xref ref-type="table-fn" rid="table2fn49">aw</xref></sup> generalized better to unseen speakers; articulatory features were strongest among disorder-specific sets but weaker in a speaker-independent setup; MFCC-based i-vectors showed clearer class clustering in t-SNE.</td><td align="left" valign="top">Multidataset evaluation</td></tr><tr><td align="left" valign="top">Yue et al [<xref ref-type="bibr" rid="ref20">20</xref>]</td><td align="left" valign="top">Automatic dysarthric speech recognition</td><td align="left" valign="top">Multistream CNN + LiGRU<sup><xref ref-type="table-fn" rid="table2fn50">ax</xref></sup></td><td align="left" valign="top">30.3% WER (dysarthric), 11% WER (typical)</td><td align="left" valign="top">Ablation; CNN filter analysis; t-SNE</td><td align="left" valign="top">Best WER resulted from combining spectrogram magnitude with vocal tract and excitation streams; speed perturbation without F0 fixing improved WER; filters fed with vocal-tract signals emphasized low quefrencies, whereas excitation filters suppressed them; t-SNE showed progressive dysarthric/typical separation and reduced gender clustering over training.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Wang et al [<xref ref-type="bibr" rid="ref52">52</xref>]</td><td align="left" valign="top">Auditory verbal hallucination detection</td><td align="left" valign="top">Uni-modal BiGRU<sup><xref ref-type="table-fn" rid="table2fn51">ay</xref></sup>; multimodal self-attention DNN</td><td align="left" valign="top">84% F1 
(overall), 78% F1 (audio, text)</td><td align="left" valign="top">Simple attention visualization, SHAP</td><td align="left" valign="top">Attention prioritized clauses describing distress, influence, or interference, aligning higher weights with higher auditory verbal hallucination severity.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Lau et al [<xref ref-type="bibr" rid="ref57">57</xref>]</td><td align="left" valign="top">Voice disorder detection</td><td align="left" valign="top">AST</td><td align="left" valign="top">81.9% UAR, 91.1% AUC</td><td align="left" valign="top">t-SNE, attention rollout</td><td align="left" valign="top">Model focused on specific phonemes (eg, /ɔ/ and the segment &#x201C;/e/ /s/ /i/ /n/&#x201D;) rather than merely high-energy regions.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Abderrazek et al [<xref ref-type="bibr" rid="ref58">58</xref>]</td><td align="left" valign="top">Head and neck cancer intelligibility</td><td align="left" valign="top">CNN</td><td align="left" valign="top">0.91 PCC<sup><xref ref-type="table-fn" rid="table2fn52">az</xref></sup></td><td align="left" valign="top">Concept detector network</td><td align="left" valign="top">No neurons detected phonetic features in the first dense layer; from subsequent layers to output, the number of phonetic feature detectors increased by a factor of 1.75.</td><td align="left" valign="top">Cross-dataset evaluation</td></tr><tr><td align="left" valign="top">Mathad et al [<xref ref-type="bibr" rid="ref7">7</xref>]</td><td align="left" valign="top">Hypernasality assessment (children with cleft palate)</td><td align="left" valign="top">DNN</td><td align="left" valign="top">0.797 PCC</td><td align="left" valign="top">Concept detector network</td><td align="left" valign="top">A DNN nasality model estimated posterior probabilities for nasal consonants, oral consonants, nasalized vowels, and oral vowels; these 
were combined into an objective hypernasality measure that quantified detected nasality against expected nasality per phrase.</td><td align="left" valign="top">Cross-dataset evaluation</td></tr><tr><td align="left" valign="top">Hung et al [<xref ref-type="bibr" rid="ref5">5</xref>]</td><td align="left" valign="top">Voice disorder classification</td><td align="left" valign="top">SincNet (CNN-based)</td><td align="left" valign="top">83.3% accuracy, 77.31% UAR</td><td align="left" valign="top">Sinc filter analysis, t-SNE</td><td align="left" valign="top">SincNet filters emphasized F1/F2 more clearly than standard CNN filters, preserving formant structure and energy in 500&#x2010;3000 Hz bands.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Vasquez-Correa et al [<xref ref-type="bibr" rid="ref13">13</xref>]</td><td align="left" valign="top">Parkinson disease assessment</td><td align="left" valign="top">CNN</td><td align="left" valign="top">97.6% accuracy, 98.7% AUC (multimodal)</td><td align="left" valign="top">Feature map analysis</td><td align="left" valign="top">Feature maps showed filters highlighting speech transitions (syllable onsets/offsets); many filters in layers 2 and 4 differentiated PD from healthy controls.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Lee et al [<xref ref-type="bibr" rid="ref60">60</xref>]</td><td align="left" valign="top">ASD<sup><xref ref-type="table-fn" rid="table2fn53">ba</xref></sup> detection in infants</td><td align="left" valign="top">BiLSTM with autoencoder</td><td align="left" valign="top">68.18% accuracy, 65.1% UAR, 54.57% F1</td><td align="left" valign="top">t-SNE</td><td align="left" valign="top">Autoencoder embeddings yielded clearer ASD vs TD<sup><xref ref-type="table-fn" rid="table2fn54">bb</xref></sup> separation than eGeMAPS<sup><xref ref-type="table-fn" rid="table2fn55">bc</xref></sup> with BiLSTM.</td><td align="left" 
valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Geng et al [<xref ref-type="bibr" rid="ref19">19</xref>]</td><td align="left" valign="top">Automatic dysarthric speech recognition</td><td align="left" valign="top">TDNN; conformer</td><td align="left" valign="top">25.5% WER</td><td align="left" valign="top">t-SNE</td><td align="left" valign="top">SVD<sup><xref ref-type="table-fn" rid="table2fn56">bd</xref></sup>-based spectrotemporal deep embeddings showed better separation of dysarthric vs typical speech than i-vectors/x-vectors.</td><td align="left" valign="top">Cross-language, multidataset evaluation</td></tr><tr><td align="left" valign="top">Kim et al [<xref ref-type="bibr" rid="ref59">59</xref>]</td><td align="left" valign="top">Laryngeal disease classification</td><td align="left" valign="top">ResNet-50</td><td align="left" valign="top">92.15% accuracy, 91.53% recall, 92.15% F1</td><td align="left" valign="top">t-SNE</td><td align="left" valign="top">Pooled CNN features for benign disease overlapped with cancer and vocal cord paralysis, explaining reduced multiclass performance.</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>t-SNE: t-distributed stochastic neighbor embedding.</p></fn><fn id="table2fn2"><p><sup>b</sup>XAI: explainable artificial intelligence.</p></fn><fn id="table2fn3"><p><sup>c</sup>MLP: multilayer perceptron.</p></fn><fn id="table2fn4"><p><sup>d</sup>1D-CNN: one-dimensional convolutional neural network.</p></fn><fn id="table2fn5"><p><sup>e</sup>LLD: low-level descriptor.</p></fn><fn id="table2fn6"><p><sup>f</sup>ResNet-14: residual network.</p></fn><fn id="table2fn7"><p><sup>g</sup>Not available.</p></fn><fn id="table2fn8"><p><sup>h</sup>CNN: convolutional neural network.</p></fn><fn id="table2fn9"><p><sup>i</sup>CBAM: convolutional block attention module.</p></fn><fn id="table2fn10"><p><sup>j</sup>Grad-CAM: gradient-weighted class activation 
mapping.</p></fn><fn id="table2fn11"><p><sup>k</sup>GRBAS: Grade, Roughness, Breathiness, Asthenia, Strain.</p></fn><fn id="table2fn12"><p><sup>l</sup>LSTM: long short-term memory.</p></fn><fn id="table2fn13"><p><sup>m</sup>RMSE: root mean squared error.</p></fn><fn id="table2fn14"><p><sup>n</sup>AUC: area under the curve.</p></fn><fn id="table2fn15"><p><sup>o</sup>SVM: support vector machine.</p></fn><fn id="table2fn16"><p><sup>p</sup>mTBI: mild traumatic brain injury.</p></fn><fn id="table2fn17"><p><sup>q</sup>UAR: unweighted average recall.</p></fn><fn id="table2fn18"><p><sup>r</sup>AST: audio spectrogram transformer.</p></fn><fn id="table2fn19"><p><sup>s</sup>Eigen-CAM: Eigen class activation mapping.</p></fn><fn id="table2fn20"><p><sup>t</sup>PD: Parkinson disease.</p></fn><fn id="table2fn21"><p><sup>u</sup>PTSD: posttraumatic stress disorder.</p></fn><fn id="table2fn22"><p><sup>v</sup>DNN: deep neural network.</p></fn><fn id="table2fn23"><p><sup>w</sup>SHAP: Shapley Additive Explanations.</p></fn><fn id="table2fn24"><p><sup>x</sup>TBI: traumatic brain injury.</p></fn><fn id="table2fn25"><p><sup>y</sup>cGRU: cascading gated recurrent unit.</p></fn><fn id="table2fn26"><p><sup>z</sup>GradientSHAP: gradient Shapley Additive Explanations.</p></fn><fn id="table2fn27"><p><sup>aa</sup>BiLSTM: bidirectional long short-term memory.</p></fn><fn id="table2fn28"><p><sup>ab</sup>DistilBERT: distilled version of Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table2fn29"><p><sup>ac</sup>LIME: local interpretable model-agnostic explanations.</p></fn><fn id="table2fn30"><p><sup>ad</sup>AD: Alzheimer disease.</p></fn><fn id="table2fn31"><p><sup>ae</sup>xDMFCC: explainable deep learning mel-frequency cepstral coefficients.</p></fn><fn id="table2fn32"><p><sup>af</sup>MFCC: mel-frequency cepstral coefficient.</p></fn><fn id="table2fn33"><p><sup>ag</sup>TDNN-HMM: time-delay neural network&#x2014;hidden Markov model.</p></fn><fn
id="table2fn34"><p><sup>ah</sup>CTC: connectionist temporal classification.</p></fn><fn id="table2fn35"><p><sup>ai</sup>LAS: Listen, Attend, and Spell model architecture.</p></fn><fn id="table2fn36"><p><sup>aj</sup>WER: word error rate.</p></fn><fn id="table2fn37"><p><sup>ak</sup>TDNN: time-delay neural network.</p></fn><fn id="table2fn38"><p><sup>al</sup>BERT: Bidirectional Encoder Representations From Transformers.</p></fn><fn id="table2fn39"><p><sup>am</sup>ELECTRA: Efficiently Learning an Encoder That Classifies Token Replacements Accurately.</p></fn><fn id="table2fn40"><p><sup>an</sup>TERA: Transformer Encoder Representations From Alteration.</p></fn><fn id="table2fn41"><p><sup>ao</sup>FC: fully connected layer.</p></fn><fn id="table2fn42"><p><sup>ap</sup>TLC: Thought, Language, and Communication.</p></fn><fn id="table2fn43"><p><sup>aq</sup>PANSS: Positive and Negative Syndrome Scale.</p></fn><fn id="table2fn44"><p><sup>ar</sup>ZCR-DNN: deep neural network with zero-crossing rate features as input.</p></fn><fn id="table2fn45"><p><sup>as</sup>WNSA-Net: axial-attention-based network using wideband and narrowband spectrograms.</p></fn><fn id="table2fn46"><p><sup>at</sup>TORGO: database of acoustic and articulatory speech from speakers with dysarthria (University of Toronto).</p></fn><fn id="table2fn47"><p><sup>au</sup>GNN: graph neural network.</p></fn><fn id="table2fn48"><p><sup>av</sup>GRU: gated recurrent unit.</p></fn><fn id="table2fn49"><p><sup>aw</sup>CQCC: constant Q cepstral coefficients.</p></fn><fn id="table2fn50"><p><sup>ax</sup>LiGRU: light gated recurrent unit.</p></fn><fn id="table2fn51"><p><sup>ay</sup>BiGRU: bidirectional gated recurrent unit.</p></fn><fn id="table2fn52"><p><sup>az</sup>PCC: Pearson correlation coefficient.</p></fn><fn id="table2fn53"><p><sup>ba</sup>ASD: autism spectrum disorder.</p></fn><fn id="table2fn54"><p><sup>bb</sup>TD: typically developing.</p></fn><fn id="table2fn55"><p><sup>bc</sup>eGeMAPS: extended Geneva Minimalistic
Acoustic Parameter Set.</p></fn><fn id="table2fn56"><p><sup>bd</sup>SVD: singular value decomposition.</p></fn></table-wrap-foot></table-wrap><p>The included studies applied explainability methods across a broad range of voice- and speech-related health domains, including voice and structural laryngeal pathology [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref59">59</xref>], Parkinson disease (PD) [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref13">13</xref>], dysarthria and automatic dysarthric speech recognition [<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref51">51</xref>], dementia [<xref ref-type="bibr" rid="ref9">9</xref>] and Alzheimer disease (AD) detection [<xref ref-type="bibr" rid="ref12">12</xref>], psychiatric and mental health conditions [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref55">55</xref>], traumatic brain injury (TBI) and focal brain lesions [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref54">54</xref>], aphasia [<xref ref-type="bibr" rid="ref82">82</xref>], cleft palate-related hypernasality assessment [<xref ref-type="bibr" rid="ref7">7</xref>], head and neck cancer&#x2013;related intelligibility assessment [<xref ref-type="bibr" rid="ref58">58</xref>], and autism spectrum disorder [<xref ref-type="bibr" rid="ref60">60</xref>].</p><p>Datasets used across the included studies varied in quality, scale, language, and provenance.
Most datasets were collected in controlled clinical environments, with only a single study [<xref ref-type="bibr" rid="ref52">52</xref>] relying on remotely collected speech data. Dataset sizes ranged from approximately 15 participants to several hundred. <xref ref-type="table" rid="table3">Table 3</xref> reports dataset sizes and tasks as described in each dataset&#x2019;s original data collection protocol, not as used in the studies included in this literature review. The datasets used represent a range of linguistic backgrounds, with English, Chinese, and Korean datasets being the most prevalent. Publicly available benchmark datasets (eg, TORGO, UASpeech [Universal Access Speech], and PC-GITA [Parkinson Corpus &#x2013; Grupo de Investigaci&#x00F3;n en Telecomunicaciones Aplicadas]) were predominantly used in studies focusing on neurodegenerative diseases and motor speech disorders, whereas studies targeting psychiatric conditions relied exclusively on institution-specific or private datasets. Speech elicitation tasks varied across studies and included sustained vowel phonation, diadochokinesis, read speech, picture description, and free or spontaneous speech.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Overview of speech and voice datasets used in the reviewed studies, including clinical domain, language, participant population, elicited speech tasks, and study usage. The datasets encompass a wide range of domains, sizes, and languages.
UASpeech<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> was the most frequently used publicly available benchmark, while other studies relied on their own data collection protocols.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset name</td><td align="left" valign="bottom">Application/domain</td><td align="left" valign="bottom">Language</td><td align="left" valign="bottom">Population</td><td align="left" valign="bottom">Tasks</td><td align="left" valign="bottom">Used by</td></tr></thead><tbody><tr><td align="left" valign="top">ADReSS<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> (DementiaBank) [<xref ref-type="bibr" rid="ref83">83</xref>]</td><td align="left" valign="top">Alzheimer disease</td><td align="left" valign="top">English</td><td align="left" valign="top">156 subjects (78 AD<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup>, 78 controls)</td><td align="left" valign="top">Cookie theft picture</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]</td></tr><tr><td align="left" valign="top">Pitt Corpus (DementiaBank) [<xref ref-type="bibr" rid="ref84">84</xref>]</td><td align="left" valign="top">Alzheimer disease</td><td align="left" valign="top">English</td><td align="left" valign="top">500 (253 AD, 247 controls)</td><td align="left" valign="top">Cookie theft picture, word fluency task, spontaneous interviews</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref9">9</xref>]</td></tr><tr><td align="left" valign="top">AVH<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup> Voice Diaries Dataset [<xref ref-type="bibr" rid="ref52">52</xref>]</td><td align="left" valign="top">Auditory verbal hallucinations</td><td align="left" valign="top">English</td><td align="left" valign="top">384 participants</td><td align="left" valign="top">30-day audio diary recordings</td><td 
align="left" valign="top">[<xref ref-type="bibr" rid="ref52">52</xref>]</td></tr><tr><td align="left" valign="top">SNUBH<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup> Infant Dataset [<xref ref-type="bibr" rid="ref60">60</xref>]</td><td align="left" valign="top">Autism</td><td align="left" valign="top">Korean</td><td align="left" valign="top">39 infants (10 ASD<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup>, 29 TD<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup>)</td><td align="left" valign="top">Clinical vocalizations during ASD assessment</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref60">60</xref>]</td></tr><tr><td align="left" valign="top">LANNA<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup> Speech Corpus [<xref ref-type="bibr" rid="ref85">85</xref>]</td><td align="left" valign="top">Specific language impairment</td><td align="left" valign="top">Czech</td><td align="left" valign="top">188 children (118 SLI<sup><xref ref-type="table-fn" rid="table3fn9">i</xref></sup>, 70 controls)</td><td align="left" valign="top">Vowels, consonants, syllables, words, sentences, picture description</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref48">48</xref>]</td></tr><tr><td align="left" valign="top">Max-Planck Brain Lesion Dataset [<xref ref-type="bibr" rid="ref54">54</xref>]</td><td align="left" valign="top">Brain lesions</td><td align="left" valign="top">Dutch</td><td align="left" valign="top">16 patients with lesions, 16 controls</td><td align="left" valign="top">Emotion-elicited word production</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref54">54</xref>]</td></tr><tr><td align="left" valign="top">Americleft Database [<xref ref-type="bibr" rid="ref86">86</xref>]</td><td align="left" valign="top">Cleft palate</td><td align="left" valign="top">English</td><td align="left" valign="top">60 children with CP<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup>, 10 
controls</td><td align="left" valign="top">Sentences</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref7">7</xref>]</td></tr><tr><td align="left" valign="top">NMCPC<sup><xref ref-type="table-fn" rid="table3fn11">k</xref></sup> Database [<xref ref-type="bibr" rid="ref87">87</xref>]</td><td align="left" valign="top">Cleft palate</td><td align="left" valign="top">English</td><td align="left" valign="top">32 children with CP, 9 controls</td><td align="left" valign="top">Sentences</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref7">7</xref>]</td></tr><tr><td align="left" valign="top">JCCOCC<sup><xref ref-type="table-fn" rid="table3fn12">l</xref></sup> MoCA<sup><xref ref-type="table-fn" rid="table3fn13">m</xref></sup> Cantonese Speech corpus [<xref ref-type="bibr" rid="ref88">88</xref>]</td><td align="left" valign="top">Cognitive impairment</td><td align="left" valign="top">Cantonese</td><td align="left" valign="top">469 speakers</td><td align="left" valign="top">Cognitive assessment interviews</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref19">19</xref>]</td></tr><tr><td align="left" valign="top">Bellevue Trauma Dataset [<xref ref-type="bibr" rid="ref89">89</xref>]</td><td align="left" valign="top">Depression or PTSD<sup><xref ref-type="table-fn" rid="table3fn14">n</xref></sup></td><td align="left" valign="top">English, Spanish, and Mandarin</td><td align="left" valign="top">377 (first round), 221 (second round)</td><td align="left" valign="top">Clinical interviews</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref51">51</xref>]</td></tr><tr><td align="left" valign="top">CMDC<sup><xref ref-type="table-fn" rid="table3fn15">o</xref></sup> [<xref ref-type="bibr" rid="ref90">90</xref>]</td><td align="left" valign="top">Depression</td><td align="left" valign="top">Cantonese</td><td align="left" valign="top">78 speakers</td><td align="left" valign="top">Structured interviews (audio, video, text)</td><td
align="left" valign="top">[<xref ref-type="bibr" rid="ref15">15</xref>]</td></tr><tr><td align="left" valign="top">DAIC-WOZ<sup><xref ref-type="table-fn" rid="table3fn16">p</xref></sup> [<xref ref-type="bibr" rid="ref91">91</xref>]</td><td align="left" valign="top">Depression, PTSD, anxiety</td><td align="left" valign="top">English</td><td align="left" valign="top">189 speakers</td><td align="left" valign="top">Semistructured clinical interviews (audio, video, text)</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref15">15</xref>]</td></tr><tr><td align="left" valign="top">CUDYS<sup><xref ref-type="table-fn" rid="table3fn17">q</xref></sup> Corpus [<xref ref-type="bibr" rid="ref92">92</xref>]</td><td align="left" valign="top">Dysarthria</td><td align="left" valign="top">Cantonese</td><td align="left" valign="top">27 dysarthric speakers</td><td align="left" valign="top">Short sentence recordings</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref18">18</xref>]</td></tr><tr><td align="left" valign="top">TORGO<sup><xref ref-type="table-fn" rid="table3fn18">r</xref></sup> [<xref ref-type="bibr" rid="ref93">93</xref>]</td><td align="left" valign="top">Dysarthria</td><td align="left" valign="top">English</td><td align="left" valign="top">15 speakers (8 dysarthric, 7 controls)</td><td align="left" valign="top">Nonwords, words, sentences</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref56">56</xref>]</td></tr><tr><td align="left" valign="top">UASpeech<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> [<xref ref-type="bibr" rid="ref94">94</xref>]</td><td align="left" valign="top">Dysarthria</td><td align="left" valign="top">English</td><td align="left" valign="top">29 speakers (16 dysarthric, 13 controls)</td><td align="left" valign="top">Isolated words</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr"
rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref56">56</xref>]</td></tr><tr><td align="left" valign="top">KSoF<sup><xref ref-type="table-fn" rid="table3fn19">s</xref></sup> [<xref ref-type="bibr" rid="ref95">95</xref>]</td><td align="left" valign="top">Fluency/stuttering</td><td align="left" valign="top">German</td><td align="left" valign="top">37 speakers</td><td align="left" valign="top">Therapy-based speech recordings</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref49">49</xref>]</td></tr><tr><td align="left" valign="top">SEP-28k-E<sup><xref ref-type="table-fn" rid="table3fn20">t</xref></sup> [<xref ref-type="bibr" rid="ref96">96</xref>]</td><td align="left" valign="top">Fluency/stuttering</td><td align="left" valign="top">English</td><td align="left" valign="top">21,857 three-second clips (23 h)</td><td align="left" valign="top">Spontaneous speech (podcasts)</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref49">49</xref>]</td></tr><tr><td align="left" valign="top">BREF<sup><xref ref-type="table-fn" rid="table3fn21">u</xref></sup> [<xref ref-type="bibr" rid="ref97">97</xref>]</td><td align="left" valign="top">Healthy speech</td><td align="left" valign="top">French</td><td align="left" valign="top">&#x2248;120 speakers (100 h)</td><td align="left" valign="top">Newspaper reading</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref58">58</xref>]</td></tr><tr><td align="left" valign="top">C2SI-LEC<sup><xref ref-type="table-fn" rid="table3fn22">v</xref></sup> [<xref ref-type="bibr" rid="ref98">98</xref>]</td><td align="left" valign="top">Head and neck cancer</td><td align="left" valign="top">French</td><td align="left" valign="top">94 patients, 41 controls</td><td align="left" valign="top">Pseudo-words, image description, read speech</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref58">58</xref>]</td></tr><tr><td align="left" valign="top">DIRAMS<sup><xref 
ref-type="table-fn" rid="table3fn23">w</xref></sup> dataset [<xref ref-type="bibr" rid="ref6">6</xref>]</td><td align="left" valign="top">Thyroidectomy speech impairment</td><td align="left" valign="top">Korean</td><td align="left" valign="top">114 patients (preoperation, 2-wk postoperation, 3 mo postoperation)</td><td align="left" valign="top">1&#x2010;20 s utterance</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref6">6</xref>]</td></tr><tr><td align="left" valign="top">PC-GITA<sup><xref ref-type="table-fn" rid="table3fn24">x</xref></sup> [<xref ref-type="bibr" rid="ref99">99</xref>]</td><td align="left" valign="top">PD<sup><xref ref-type="table-fn" rid="table3fn25">y</xref></sup></td><td align="left" valign="top">Spanish</td><td align="left" valign="top">100 subjects (50 patients with PD, 50 controls)</td><td align="left" valign="top">Sustained vowel, DDK<sup><xref ref-type="table-fn" rid="table3fn26">z</xref></sup>, words, sentences, read passage, free speech</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]</td></tr><tr><td align="left" valign="top">Sangmyung University PD Dataset [<xref ref-type="bibr" rid="ref10">10</xref>]</td><td align="left" valign="top">PD</td><td align="left" valign="top">Korean</td><td align="left" valign="top">200 speakers (100 PD, 100 controls)</td><td align="left" valign="top">Vowels, consonants, DDK</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref10">10</xref>]</td></tr><tr><td align="left" valign="top">Ruhr University PD Dataset [<xref ref-type="bibr" rid="ref100">100</xref>]</td><td align="left" valign="top">PD</td><td align="left" valign="top">German</td><td align="left" valign="top">168 idiopathic PDs</td><td align="left" valign="top">DDK, read passage</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref13">13</xref>]</td></tr><tr><td align="left" valign="top">CzechPD [<xref ref-type="bibr" 
rid="ref101">101</xref>]</td><td align="left" valign="top">PD</td><td align="left" valign="top">Czech</td><td align="left" valign="top">46 speakers (23 PD, 23 controls)</td><td align="left" valign="top">Sustained vowel, DDK, read passage, free speech</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref13">13</xref>]</td></tr><tr><td align="left" valign="top">Sichuan University Schizophrenia Dataset [<xref ref-type="bibr" rid="ref48">48</xref>]</td><td align="left" valign="top">Schizophrenia</td><td align="left" valign="top">Mandarin</td><td align="left" valign="top">28 patients, 28 controls</td><td align="left" valign="top">Emotion-elicited reading</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref48">48</xref>]</td></tr><tr><td align="left" valign="top">NTUH<sup><xref ref-type="table-fn" rid="table3fn27">aa</xref></sup> Schizophrenia Dataset [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" valign="top">Schizophrenia</td><td align="left" valign="top">Taiwanese</td><td align="left" valign="top">26 patients</td><td align="left" valign="top">Clinical interviews</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref14">14</xref>]</td></tr><tr><td align="left" valign="top">Coelho Corpus [<xref ref-type="bibr" rid="ref102">102</xref>]</td><td align="left" valign="top">TBI<sup><xref ref-type="table-fn" rid="table3fn28">ab</xref></sup></td><td align="left" valign="top">English</td><td align="left" valign="top">55 TBI, 52 controls</td><td align="left" valign="top">Memory task, picture description</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref53">53</xref>]</td></tr><tr><td align="left" valign="top">Adolescent mTBI<sup><xref ref-type="table-fn" rid="table3fn29">ac</xref></sup> Dataset [<xref ref-type="bibr" rid="ref47">47</xref>]</td><td align="left" valign="top">mTBI</td><td align="left" valign="top">English</td><td align="left" valign="top">72 concussion, 93 controls</td><td align="left" 
valign="top">Multisyllabic word reading</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref47">47</xref>]</td></tr><tr><td align="left" valign="top">FEMH<sup><xref ref-type="table-fn" rid="table3fn30">ad</xref></sup> Speech Disorder Database [<xref ref-type="bibr" rid="ref103">103</xref>]</td><td align="left" valign="top">Voice disorders</td><td align="left" valign="top">Mandarin</td><td align="left" valign="top">1061 samples (101 neoplasm, 100 functional dysphonia, 124 vocal palsy, 718 phonotrauma, 100 normal)</td><td align="left" valign="top">Sustained vowel</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref5">5</xref>]</td></tr><tr><td align="left" valign="top">Saarbr&#x00FC;cken Voice Database [<xref ref-type="bibr" rid="ref104">104</xref>]</td><td align="left" valign="top">Voice disorders</td><td align="left" valign="top">German</td><td align="left" valign="top">687 healthy, 1356 patients</td><td align="left" valign="top">Sustained vowels, pitch glides, read phrase</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref57">57</xref>]</td></tr><tr><td align="left" valign="top">VOICED<sup><xref ref-type="table-fn" rid="table3fn31">ae</xref></sup> [<xref ref-type="bibr" rid="ref105">105</xref>]</td><td align="left" valign="top">Voice disorders</td><td align="left" valign="top">Italian</td><td align="left" valign="top">208 samples (150 pathological, 58 healthy)</td><td align="left" valign="top">Sustained vowel</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]</td></tr><tr><td align="left" valign="top">Yeouido St. 
Mary Hospital of the Catholic University Voice Dataset [<xref ref-type="bibr" rid="ref59">59</xref>]</td><td align="left" valign="top">Voice disorder</td><td align="left" valign="top">Korean</td><td align="left" valign="top">30 laryngeal cancer, 97 vocal fold paralysis, 81 benign mucosal disease, 155 controls</td><td align="left" valign="top">Sustained vowel</td><td align="left" valign="top">[<xref ref-type="bibr" rid="ref59">59</xref>]</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>UASpeech: Universal Access Speech.</p></fn><fn id="table3fn2"><p><sup>b</sup>ADReSS: Alzheimer's Dementia Recognition Through Spontaneous Speech.</p></fn><fn id="table3fn3"><p><sup>c</sup>AD: Alzheimer disease.</p></fn><fn id="table3fn4"><p><sup>d</sup>AVH: auditory verbal hallucination.</p></fn><fn id="table3fn5"><p><sup>e</sup>SNUBH: Seoul National University Bundang Hospital.</p></fn><fn id="table3fn6"><p><sup>f</sup>ASD: autism spectrum disorder.</p></fn><fn id="table3fn7"><p><sup>g</sup>TD: typically developing.</p></fn><fn id="table3fn8"><p><sup>h</sup>LANNA: Laboratory of Artificial Neural Network Applications.</p></fn><fn id="table3fn9"><p><sup>i</sup>SLI: specific language impairment.</p></fn><fn id="table3fn10"><p><sup>j</sup>CP: cleft palate.</p></fn><fn id="table3fn11"><p><sup>k</sup>NMCPC: New Mexico Cleft Palate Center.</p></fn><fn id="table3fn12"><p><sup>l</sup>JCCOCC: Jockey Club Centre for Osteoporosis Care and Control.</p></fn><fn id="table3fn13"><p><sup>m</sup>MoCA: Montreal Cognitive Assessment.</p></fn><fn id="table3fn14"><p><sup>n</sup>PTSD: posttraumatic stress disorder.</p></fn><fn id="table3fn15"><p><sup>o</sup>CMDC: Chinese Multimodal Depression Corpus.</p></fn><fn id="table3fn16"><p><sup>p</sup>DAIC-WOZ: Distress Analysis Interview Corpus &#x2013; Wizard of Oz.</p></fn><fn id="table3fn17"><p><sup>q</sup>CUDYS: Chinese University of Hong Kong Dysarthric Speech.</p></fn><fn id="table3fn18"><p><sup>r</sup>TORGO: database of acoustic 
and articulatory speech from speakers with dysarthria (University of Toronto).</p></fn><fn id="table3fn19"><p><sup>s</sup>KSoF: Kassel State of Fluency.</p></fn><fn id="table3fn20"><p><sup>t</sup>SEP-28k-E: Stuttering Events in Podcasts (extended).</p></fn><fn id="table3fn21"><p><sup>u</sup>BREF: a large read-speech corpus for French (Computer Science Laboratory for Mechanics and Engineering Sciences &#x2013; French National Centre for Scientific Research).</p></fn><fn id="table3fn22"><p><sup>v</sup>C2SI-LEC: Carcinologic Speech Severity Index corpus &#x2013; short text reading task.</p></fn><fn id="table3fn23"><p><sup>w</sup>DIRAMS: Dongnam Institute of Radiological and Medical Sciences.</p></fn><fn id="table3fn24"><p><sup>x</sup>PC-GITA: Parkinson Corpus &#x2013; Grupo de Investigaci&#x00F3;n en Telecomunicaciones Aplicadas.</p></fn><fn id="table3fn25"><p><sup>y</sup>PD: Parkinson disease.</p></fn><fn id="table3fn26"><p><sup>z</sup>DDK: diadochokinesis.</p></fn><fn id="table3fn27"><p><sup>aa</sup>NTUH: National Taiwan University Hospital.</p></fn><fn id="table3fn28"><p><sup>ab</sup>TBI: traumatic brain injury.</p></fn><fn id="table3fn29"><p><sup>ac</sup>mTBI: mild traumatic brain injury.</p></fn><fn id="table3fn30"><p><sup>ad</sup>FEMH: Far Eastern Memorial Hospital.</p></fn><fn id="table3fn31"><p><sup>ae</sup>VOICED: VOice ICar fEDerico II.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Explainability Methods</title><p>Explainability methods used in the included literature are summarized in <xref ref-type="table" rid="table1">Table 1</xref>. 
Across the 30 included studies, gradient-based saliency methods, input perturbation techniques, and model-internal representation analysis were the most frequently used explainability approaches.</p><p>Gradient-based saliency methods were reported in 7 studies [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref47">47</xref>-<xref ref-type="bibr" rid="ref50">50</xref>], with Grad-CAM being the most commonly used technique, appearing in 5 studies. Grad-CAM [<xref ref-type="bibr" rid="ref106">106</xref>] is a model-specific technique that uses gradients of convolutional feature maps to generate class-discriminative explanations as coarse (ie, non&#x2013;pixel-level) heatmaps. Other gradient-based approaches, including guided backpropagation and vanilla saliency maps, were rarely used [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]. Vanilla saliency [<xref ref-type="bibr" rid="ref107">107</xref>] computes pixel-level importance by calculating gradients of the model output (ie, logits) with respect to the input features, where the magnitude of the gradient at each feature indicates its importance to the model&#x2019;s decision. Guided backpropagation [<xref ref-type="bibr" rid="ref108">108</xref>] differs from vanilla saliency maps in that gradients are only propagated when both the forward activation and the backward gradient are positive, resulting in cleaner, less noisy, and more visually interpretable maps. Unlike Grad-CAM, vanilla saliency and guided backpropagation operate directly at the input level and do not provide explicit, spatially localized class-discriminative explanations.</p><p>Input perturbation-based techniques were also widely adopted across applications. 
SHAP was implemented in 4 studies spanning dementia detection [<xref ref-type="bibr" rid="ref9">9</xref>], auditory verbal hallucination assessment [<xref ref-type="bibr" rid="ref52">52</xref>], psychiatric disorder classification [<xref ref-type="bibr" rid="ref51">51</xref>], and TBI [<xref ref-type="bibr" rid="ref53">53</xref>]. SHAP [<xref ref-type="bibr" rid="ref109">109</xref>] is a model-agnostic explainability method grounded in cooperative game theory that attributes to each feature a contribution score, quantifying how the presence or absence of that feature influences a model&#x2019;s prediction. Ablation-based analysis was the most prevalent perturbation method, reported in 9 studies, and was applied across multiple clinical tasks [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref82">82</xref>]. Ablation studies usually involve systematically removing specific parts of the model or input features and observing the resulting impact on model performance.</p><p>Model-internal representation analysis was reported in 12 studies, where interpretability was derived through inspection or visualization of learned internal model representations. 
Although primarily used as a dimensionality reduction technique, t-distributed stochastic neighbor embedding (t-SNE) [<xref ref-type="bibr" rid="ref110">110</xref>] dominated this category, appearing in 9 studies [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref59">59</xref>,<xref ref-type="bibr" rid="ref60">60</xref>], for its utility in mapping high-dimensional latent representations into low-dimensional space while preserving local neighborhood relationships. Additional representation-level analysis included inspection of learned convolutional filters [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref20">20</xref>], parameterized filter structures (ie, Sinc filters) [<xref ref-type="bibr" rid="ref5">5</xref>], and dimensionality reduction of feature maps (ie, Eigen class activation maps [Eigen-CAM]) [<xref ref-type="bibr" rid="ref10">10</xref>], providing insight into how internal model components responded to speech signals. Sinc filters are parameterized to directly model frequency bands, allowing them to more effectively emphasize formant frequencies. As Sinc filters effectively function as bandpass filters, they offer conceptually more interpretable representations compared to the abstract patterns learned by arbitrarily shaped CNN filters [<xref ref-type="bibr" rid="ref5">5</xref>]. 
Unlike Grad-CAM, Eigen-CAM [<xref ref-type="bibr" rid="ref111">111</xref>] is a gradient-free method that calculates saliency maps by taking the first principal component of flattened feature maps.</p><p>Surrogate model-based methods were less frequently used among the included literature [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref54">54</xref>]. Most notably, LIME [<xref ref-type="bibr" rid="ref112">112</xref>] is a model-agnostic explainability method that learns a simple, interpretable surrogate model, such as linear models or decision trees, that locally approximates the predictions of a black-box model. An adaptation of LIME, explainable deep learning mel-frequency cepstral coefficients (xDMFCC), is another surrogate-based model designed for interpreting MFCC-based audio representations [<xref ref-type="bibr" rid="ref54">54</xref>].</p><p>Concept-based methods, which link internal model activations to predefined, human-interpretable concepts to explain predictions in domain-relevant terms, were implemented in 2 studies [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref58">58</xref>].</p><p>Lastly, attention-based explanations were reported in 2 studies, where interpretability was derived either through direct visualization of attention weights [<xref ref-type="bibr" rid="ref113">113</xref>] or through attention rollout-based aggregation of attention across layers [<xref ref-type="bibr" rid="ref57">57</xref>].</p></sec><sec id="s3-3"><title>Explainability Input-Output Representation</title><p>Reported explainability methods operated over a range of input representations, which ultimately defined the form and interpretability of the resulting explanations.</p><p>Time-frequency and cepstral representations, including wideband and narrowband spectrograms, mel-spectrograms, and MFCCs, were commonly interrogated by gradient-based saliency methods [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" 
rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]. In these cases, explanations were presented as heatmaps highlighting salient temporal segments and spectral bins of the input representation.</p><p>Surrogate-based methods and input perturbation techniques typically operated over cepstral representations and acoustic low-level descriptors (LLDs) [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref51">51</xref>]. These approaches produced explanations in the form of numerical feature importance values and ranking scores. Various audio representations were used for ablation analysis, where interpretable insight was based on tabular performance changes, following feature removal or modification [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref82">82</xref>].</p><p>Model-internal representation analyses, such as t-SNE, derived interpretability from latent representations learned by deep models, with explanations expressed as 2D projections revealing structure or separation in the latent space [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref59">59</xref>,<xref ref-type="bibr" rid="ref60">60</xref>]. 
In a small number of studies, parameterized filter models operating directly on raw waveforms were used, where interpretability was provided through filter frequency responses identifying salient frequency bands [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref53">53</xref>].</p><p>Finally, concept-based approaches operated on frame-level cepstral representations, where neural activations were inspected in relation to predefined, human-interpretable semantic or clinically meaningful concepts [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref58">58</xref>].</p></sec><sec id="s3-4"><title>Domain-Specific Explanation Patterns</title><sec id="s3-4-1"><title>Overview</title><p>Outputs of explainability and interpretability methods were synthesized according to the target clinical application, thus identifying recurring clinical interpretability themes across the literature for each outcome domain.</p></sec><sec id="s3-4-2"><title>Voice and Structural Laryngeal Pathology</title><p>Explanations within the voice disorder classification and laryngeal pathology literature indicate saliency across low- to midfrequency regions associated with formant structure and harmonic organization, as well as high-frequency components indicative of noise and phonatory instability. For example, for voice disorders classification, gradient-based methods emphasized low-amplitude, high-frequency spectrotemporal regions [<xref ref-type="bibr" rid="ref4">4</xref>], while disorder-specific band emphasis patterns were reported in the study by Peng et al [<xref ref-type="bibr" rid="ref3">3</xref>]. On a similar note, t-SNE analysis for pooled CNN features revealed overlap for benign and malignant laryngeal diseases [<xref ref-type="bibr" rid="ref59">59</xref>]. 
On the other hand, parameterized Sinc filters were found to more explicitly capture F1 and F2 frequencies and energy bands compared to traditional CNN filters [<xref ref-type="bibr" rid="ref5">5</xref>]. Postoperative recovery assessment using GRBAS (Grade, Roughness, Breathiness, Asthenia, Strain) scores showed distinct activation patterns within the 0&#x2010;2 kHz region typical of formant structure, the 2&#x2010;4 kHz band associated with harmonics and noise components, and temporally localized regions corresponding to pauses and breathiness [<xref ref-type="bibr" rid="ref6">6</xref>]. In head and neck cancer, articulatory attributes were found to be important for intelligibility assessment through the discovery of progressively increasing phonetic feature detectors (neurons) across deeper layers of the concept detector network [<xref ref-type="bibr" rid="ref58">58</xref>]. Ultimately, explanations consistently conveyed the importance of phonatory control, harmonic structure, and articulatory clarity as commonly highlighted cues.</p><p>While the studies by Peng et al [<xref ref-type="bibr" rid="ref3">3</xref>] and Shaikh et al [<xref ref-type="bibr" rid="ref4">4</xref>] relied on the VOICED (VOice ICar fEDerico II) dataset, other studies based their investigations on distinct public or locally collected datasets. Thus, the aforementioned explanation patterns do not seem to be confined to a single dataset.</p></sec><sec id="s3-4-3"><title>Parkinson Disease</title><p>Insights drawn from explainability analysis revealed degraded spectral patterns and impaired articulatory transitions consistent with hypokinetic dysarthria. Eigen-CAM emphasized higher frequencies typical of muffled and degraded speech [<xref ref-type="bibr" rid="ref10">10</xref>]. CNN filter analysis indicated slower and less distinct phoneme transitions for patients with PD [<xref ref-type="bibr" rid="ref13">13</xref>]. 
Similarly, ablation studies underscored the importance of long-term temporal dependencies and cepstral dynamics demonstrating the altered articulatory dynamics of patients with PD [<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>It is worth noting that the studies by Lahoti et al [<xref ref-type="bibr" rid="ref11">11</xref>] and Vasquez-Correa et al [<xref ref-type="bibr" rid="ref13">13</xref>] used the same dataset (PC-GITA); therefore, convergent findings between these 2 studies should be interpreted with appropriate context.</p></sec><sec id="s3-4-4"><title>Dysarthria and Dysarthric Speech Recognition</title><p>Ablation analyses in dysarthria severity classification and automatic dysarthric speech recognition applications identified various spectral and temporal features as significant contributors to model decisions. Guided backpropagation revealed diffuse and less localized activation patterns in high-severity dysarthric cases, in contrast to more focused vowel-centered saliency in milder cases [<xref ref-type="bibr" rid="ref50">50</xref>]. In ASR models, ablation and latent representation analyses emphasized the importance of temporal dependency modeling through time delay neural network architectures [<xref ref-type="bibr" rid="ref18">18</xref>], speaker adaptation mechanisms [<xref ref-type="bibr" rid="ref20">20</xref>], and low-quefrency components reflecting vocal tract characteristics [<xref ref-type="bibr" rid="ref19">19</xref>]. Additionally, MFCC-, constant Q cepstral coefficients&#x2013;, and articulatory-based feature sets demonstrated varying performance across speaker-dependent and speaker-independent ASR systems [<xref ref-type="bibr" rid="ref56">56</xref>]. Notably, the findings across these studies do not constitute independent validation, as all dysarthria-focused experiments were conducted using the same dataset (UASpeech), as shown in <xref ref-type="table" rid="table3">Table 3</xref>. 
Therefore, these findings are data-specific and are not generalizable conclusions.</p></sec><sec id="s3-4-5"><title>Dementia and Alzheimer Disease</title><p>In dementia detection, SHAP and LIME analysis identified noun phrase rate, empty word rate, and hesitation ratio as influential linguistic and fluency features indicative of decreased lexical complexity and increased disfluency in AD speech, alongside voice-related features reflecting lower vocal energy [<xref ref-type="bibr" rid="ref9">9</xref>]. In addition to linguistic attributes, graph-based models in the study by Laguarta and Subirana [<xref ref-type="bibr" rid="ref12">12</xref>] highlighted memory-related biomarkers in early-stage AD. It is important to note that both studies used the ADReSS (Alzheimer's Dementia Recognition Through Spontaneous Speech) dataset; therefore, their convergent findings should be interpreted within the context of the same dataset.</p></sec><sec id="s3-4-6"><title>Psychiatric Disorders</title><p>Explainability and interpretability analyses across psychiatric disorders were found to emphasize the importance of vocal-spectral attributes, linguistic content, and temporally localized affective states. Grad-CAM heatmaps revealed reduced high-frequency energy in schizophrenic speech and highlighted altered formant contours compared to controls, typical of articulatory disruption [<xref ref-type="bibr" rid="ref48">48</xref>]. Ablation analysis in the study by He et al [<xref ref-type="bibr" rid="ref55">55</xref>] showed that wideband spectrograms, capturing transient events and articulatory changes, and narrowband spectrograms, emphasizing pitch variation and voice quality, provided complementary representations for schizophrenia detection. 
SHAP analyses of multimodal models identified reduced voice intensity alongside linguistic markers of negative affect and self-focus as important predictors for posttraumatic stress disorder and major depressive disorder [<xref ref-type="bibr" rid="ref51">51</xref>]. Similarly, voice quality and prosodic features ranked among the top SHAP predictors for auditory verbal hallucination detection, while attention heatmaps prioritized textual clauses reflecting distress and interference [<xref ref-type="bibr" rid="ref52">52</xref>]. In the study by Zhang et al [<xref ref-type="bibr" rid="ref15">15</xref>], self-attention mechanisms applied to shorter audio segments enhanced depression detection, suggesting the temporal locality of depressive state-related cues. Across psychiatric applications, explanations frequently reflected prosodic flattening, altered spectral energy distribution, and content-level emotional markers.</p></sec><sec id="s3-4-7"><title>Traumatic Brain Injury and Brain Lesion Detection</title><p>In TBI, GradientSHAP identified high-frequency spectral patterns and filler words as salient markers [<xref ref-type="bibr" rid="ref53">53</xref>], and Grad-CAM highlighted high-frequency components in mild traumatic brain injury classification [<xref ref-type="bibr" rid="ref47">47</xref>]. xDMFCC analysis demonstrated the importance of lower-order cepstral coefficients as indicators of spectral sharpness and speech clarity in brain lesion detection [<xref ref-type="bibr" rid="ref54">54</xref>]. In contrast, higher-order cepstral coefficients did not clearly capture phoneme transitions in patients. The study concluded that brain lesion speech was characterized by delayed onset, slower articulation, and prolonged phoneme duration. 
Explanations across these studies allude to reduced articulatory precision and spectral clarity as discriminatory markers.</p></sec><sec id="s3-4-8"><title>Other Clinical Domains</title><p>Several additional clinical applications were represented by single studies and therefore did not permit cross-study pattern synthesis.</p><p>The work by Mathad et al [<xref ref-type="bibr" rid="ref7">7</xref>] performed a hypernasality assessment using a concept detector network to estimate posterior probabilities of nasal and oral phoneme classes, which were subsequently combined into an objective hypernasality measure. Similarly, Abderrazek et al [<xref ref-type="bibr" rid="ref58">58</xref>] used a concept detector framework for head and neck cancer intelligibility assessment, using a French phone classifier to identify phonetic feature detectors (ie, internal neurons) that informed a quantitative intelligibility metric. Ablation analysis in the study by Herath et al [<xref ref-type="bibr" rid="ref82">82</xref>] demonstrated the superiority of MFCCs over alternative spectral and temporal representations for aphasia severity classification. For autism spectrum disorder detection, t-SNE plots revealed clearer separation of autoencoder embeddings compared to handcrafted acoustic features [<xref ref-type="bibr" rid="ref60">60</xref>]. 
Finally, Shen and Zhang [<xref ref-type="bibr" rid="ref49">49</xref>] applied time-related Grad-CAM to highlight temporally localized activations aligned with manually annotated stutter disfluency segments.</p></sec><sec id="s3-4-9"><title>Explainability Validation Strategies</title><p>Of the 30 included studies, 6 performed model-centric explainability validation using external data [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref58">58</xref>], including 1 study [<xref ref-type="bibr" rid="ref19">19</xref>] that explicitly assessed the cross-language consistency of explanation patterns. Ground-truth or annotation-based verification was reported in a single study [<xref ref-type="bibr" rid="ref49">49</xref>], where salient temporal regions identified by the explainability method were compared against reference labels provided by nonclinical annotators.</p><p>The remaining studies did not conduct quantitative validation of explainability outputs. Instead, 10 studies relied solely on qualitative interpretation of explanations and comparison with findings reported in prior literature [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref50">50</xref>-<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref57">57</xref>]. In these cases, explanations were assessed narratively for plausibility or consistency with known clinical or acoustic characteristics without formal evaluation of faithfulness, robustness, or stability. 
Zhang et al [<xref ref-type="bibr" rid="ref9">9</xref>] and Fu et al [<xref ref-type="bibr" rid="ref48">48</xref>] performed cross-dataset evaluation not as consistency analysis for explainability but as external validation for the underlying model. Lastly, 12 studies reported explainability outputs without conducting quantitative analysis or qualitative literature comparisons [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref59">59</xref>,<xref ref-type="bibr" rid="ref60">60</xref>,<xref ref-type="bibr" rid="ref82">82</xref>].</p></sec><sec id="s3-4-10"><title>Human-Centered Analysis and Stakeholder Alignment</title><p>None of the included studies explicitly reported formal human-centered evaluation of explainability outputs, such as structured assessment of explanations by clinicians, speech-language pathologists, or regulatory stakeholders.</p><p>To contextualize potential stakeholder involvement, we additionally recorded the domain expertise of study authors. 
Approximately half of the included studies [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref18">18</xref>-<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref82">82</xref>] were authored exclusively by technical researchers (eg, computer scientists, engineers, or biomedical engineers), while the remaining studies included at least one author with nontechnical domain expertise (eg, clinical, medical, or speech-language pathology background). However, the presence of nontechnical coauthors did not correspond to explicit reporting of human-in-the-loop evaluation of explainability outputs.</p></sec><sec id="s3-4-11"><title>Quality and Risk of Bias Assessment</title><p>A total of 25 studies were eligible for PROBAST+AI assessment, while 5 studies were not assessed because their models did not predict health-related outcomes. Of these 5 studies, 3 developed automatic dysarthric speech recognition models, and 2 developed models predicting phonetic features. Although the outputs of these models were later used to derive clinical measures, the models themselves did not constitute health-related prediction tasks within the scope of PROBAST+AI.</p><p>For model development, 80% (20/25) of studies were judged to have a high level of overall quality concern, 16% (4/25) of studies had a low level of overall quality concern, and 4% (1/25) of studies received an overall unclear rating. Domain 4 (analysis) was the primary driver of high concern, followed by domain 1 (participants and data sources). 
In domain 4, concerns were mainly related to small development datasets relative to model complexity and insufficient safeguards against overfitting. In domain 1, concerns stemmed from unclear or restrictive inclusion and exclusion criteria, limiting representativeness. Domains 2 (predictors) and 3 (outcome) were rated as low concern in nearly all studies. Applicability concerns across domains 1&#x2010;3 were rated as low for all assessed studies.</p><p>For model evaluation, 92% (23/25) of studies were judged to have a high risk of bias, 4% (1/25) of studies a low risk, and 4% (1/25) of studies unclear risk. As in development, domains 4 and 1 were the main contributors to elevated risk. Evaluation splits were frequently small and unlikely to be representative, particularly in relation to model complexity. Data leakage was identified in 6 studies. Furthermore, performance assessment rarely extended beyond standard discrimination metrics; calibration assessment and decision-analytic measures were largely absent. Domains 2 and 3 were rated as low risk of bias across all studies. Detailed domain-level ratings for each study are provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p></sec></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Methodological Quality of Underlying Prediction Models</title><p>The concentration of quality concerns within the analysis domain, as identified by PROBAST+AI [<xref ref-type="bibr" rid="ref80">80</xref>], has important implications for XAI applications and the interpretability of explanations. 
The use of complex, high-capacity deep learning models trained and evaluated on limited clinical voice datasets, often without external validation or calibration assessment, increases the risk of overfitting to dataset-specific characteristics rather than generalizable, condition-relevant attributes.</p><p>This is especially important because post hoc explainability methods are contingent on the model&#x2019;s internal representations and input-output relationships. Explanations derived from overfitted or insufficiently validated models may appear coherent while reflecting confounding artifacts or spurious correlations [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref72">72</xref>]. Thus, the credibility of XAI methods does not rest solely on the faithfulness of the explainability technique, but also on the methodological rigor of the development and evaluation of the underlying predictive models.</p><p>These considerations are particularly relevant for the analysis of the domain-specific explanation patterns. Although recurring themes were observed across the clinical domains, the risk of bias of the underlying models and circular validation across studies due to the use of the same dataset should motivate the reader to view the findings or explanations with caution, as they may reflect data-specific characteristics rather than generalizable disease-related insights. 
Accordingly, the purpose of presenting domain-specific explanation patterns is not to establish definitive clinical explanatory signatures, but rather to characterize the current landscape of explainability practice in voice and speech AI and to motivate more rigorous, validated, and clinically grounded approaches in future work.</p></sec><sec id="s4-2"><title>Interpretation of Explanations</title><p>Although current XAI methods provide preliminary insight into the inner workings of clinical audio models, the interpretation of explainability outputs is rarely subjected to rigorous validation. Many studies rely on a limited number of illustrative examples to interpret local explanations (eg, saliency maps or attention weights) without quantitative assessment or statistical analysis [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref57">57</xref>]. This increases the risk of overinterpretation such that visually compelling or anecdotal explanations are inferred to be clinically meaningful despite limited evidence of generalizability.</p><p>Some studies attempt to contextualize explanation outputs by comparison with established clinical or acoustic knowledge [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref50">50</xref>-<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref54">54</xref>]. While such comparisons may enhance face validity, they do not guarantee faithfulness of the explanation to the model&#x2019;s true decision process. 
Prior work has demonstrated, for example, that attention weights are not inherently faithful indicators of feature importance and may exhibit weak or inconsistent correspondence with gradient-based relevance measures, leading to potentially misleading interpretations [<xref ref-type="bibr" rid="ref114">114</xref>-<xref ref-type="bibr" rid="ref116">116</xref>]. Interpreting explanations primarily through the lens of existing medical literature may therefore introduce confirmation bias, whereby explanations that align with prior expectations are accepted uncritically while alternative patterns are overlooked [<xref ref-type="bibr" rid="ref35">35</xref>].</p><p>These concerns are reinforced by the fact that most reviewed studies [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref50">50</xref>-<xref ref-type="bibr" rid="ref54">54</xref>] did not perform any form of quantitative explainability validation. In the absence of systematic evaluation, explanatory interpretations are more vulnerable to confirmation bias and may overstate their clinical relevance. This pattern might be indicative of underlying issues relevant to clinical voice and speech research. The scarcity of quantitative explanation validation is possibly driven by the high cost and, consequently, the scarcity of annotated, high-quality clinical voice and speech datasets.
The same reason might also explain the high risk of bias for most included studies, even for otherwise methodologically sound ones.</p></sec><sec id="s4-3"><title>Complexity-Transparency Trade-Off</title><p>The reviewed studies suggest an inherent trade-off between model complexity, input representation, and explainability in clinical audio-based AI systems [<xref ref-type="bibr" rid="ref32">32</xref>]. In particular, the interpretability of model explanations is closely linked to the degree of semantic transparency in the input features. Audio representation in the literature included raw audio [<xref ref-type="bibr" rid="ref49">49</xref>], low-level acoustic descriptors (eg, jitter, shimmer, and harmonic-to-noise ratio) [<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref60">60</xref>], raw time-frequency representations (eg, spectrograms and mel-spectrograms) [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref82">82</xref>], coefficient-based features (eg, MFCCs) [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref82">82</xref>], and deep neural network&#x2013;transformer-based embeddings (eg, i-vectors, wav2vec, and HuBERT [Hidden-Unit Bidirectional Encoder Representations From Transformers]) [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref60">60</xref>].</p><p>Models trained on manually extracted LLDs offer the highest degree of interpretability, as these features are directly related to established clinical biomarkers or constructs. Explanations from models trained on LLDs can be relatively easily understood using feature attribution methods such as SHAP or LIME. 
This alignment between model explanations and clinical knowledge contributes to the popularity of such methods in the medical AI domain.</p><p>While MFCCs are also hand-engineered features, they present interpretability challenges due to their abstract and decorrelated nature [<xref ref-type="bibr" rid="ref117">117</xref>]. The coefficients do not correspond directly to intuitive phonetic or physiological phenomena. Although recent techniques such as xDMFCC attempt to provide coefficient-level explanations across time, and Tracey et al [<xref ref-type="bibr" rid="ref117">117</xref>] sought to demystify MFCCs as vocal biomarkers by correlating them with known LLDs, the clinical relevance of these explanations remains limited. Despite efforts to position MFCCs as vocal biomarkers, they largely obscure the internal workings of deep models and contribute to the opacity of clinical audio systems [<xref ref-type="bibr" rid="ref118">118</xref>].</p><p>While raw audio exhibits energy and temporal information, spectrograms offer a more interpretable alternative. As visual time-frequency representations, they capture dynamic changes in energy that are often physically and clinically meaningful, such as vocal formants or pauses. Trained speech pathologists and clinicians can interpret spectrograms directly, making visualization-based explanations (eg, saliency maps) more accessible and potentially clinically actionable compared to MFCCs. Although visual explanations of spectrograms might inform about the time segment most important for the detection of a condition and can highlight relevant frequency bands, more granular visual explanations are typically not feasible.</p><p>Finally, state-of-the-art systems increasingly rely on transformer-based embeddings such as wav2vec and HuBERT, which yield task-optimized, high-dimensional representations learned from raw waveforms. 
While these embeddings deliver substantial performance gains, they are the most difficult to interpret due to high-dimensional, nonlinear abstraction from both low-level acoustic features and clinically grounded descriptors. As a result, explanations tend to be less granular, less transparent, and harder to align with clinical reasoning.</p></sec><sec id="s4-4"><title>Misalignment of Explanations With Stakeholder Needs</title><p>A major theme in the literature is the misalignment between the explanation form or modality and the needs of the end users or stakeholders. All included studies produce explanations targeted at highly technical audiences (ie, AI researchers and developers), without taking into consideration the interpretive frameworks of clinicians, patients, regulators, and policymakers. While such explanations offer insight to AI developers and researchers, critical for understanding model behavior, enhancing performance, and ensuring reliability, these explanations are often too technical, abstract, or detached from domain-specific language to be directly actionable in clinical decision-making. <xref ref-type="fig" rid="figure2">Figure 2</xref> highlights the central challenge of stakeholder&#x2013;explanation misalignment in clinical voice and speech audio AI, illustrating how diverse stakeholders hold distinct expectations and informational needs, and emphasizing the necessity of tailoring explanation strategies accordingly.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Although XAI aims to address the black-box issue of deep learning models, current XAI methods do not cater to the diverse expectations and needs of the different stakeholders.
AI: artificial intelligence; XAI: explainable artificial intelligence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e83790_fig02.png"/></fig><p>Explanations that focus on algorithmic mechanisms poorly convey information in terms of established diagnostic criteria or clinical reasoning processes [<xref ref-type="bibr" rid="ref35">35</xref>]. Patients, with their highly limited technical and clinical knowledge, are less likely to draw relevant insights from technical explanation modalities such as activation maps and feature attribution plots [<xref ref-type="bibr" rid="ref119">119</xref>]. Similarly, regulators and policymakers prefer explanations that offer transparent, auditable decision pathways to assess compliance, fairness, and accountability [<xref ref-type="bibr" rid="ref120">120</xref>].</p><p>This gap is partly a result of explanation design being driven primarily by XAI method availability rather than user requirements [<xref ref-type="bibr" rid="ref121">121</xref>]. Most of the reviewed literature adapts generic explainability techniques to clinical audio tasks without taking into consideration the information needs, domain expertise, or cognitive constraints of their target users. This highlights the imperative need for human-centered, context-aware design of clinical audio explainability methods in which concerned stakeholders participate and provide valuable input for suitable explanation modalities and content.</p></sec><sec id="s4-5"><title>Future Directions</title><p>In this work, we discussed several limitations of current methods and identified opportunities for advancing interpretability and explainability in clinical audio-based deep learning systems.
Accordingly, this section outlines future directions and recommendations for advancing XAI specifically for clinical audio applications, as summarized in <xref ref-type="fig" rid="figure3">Figure 3</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>The future of voice and speech XAI in health care lies in the integration of perceptually aligned explanation methods, robust evaluation frameworks, and stakeholder-centered design, enabling explanations that are both faithful to model behavior and meaningful in real-world clinical practice. XAI: explainable artificial intelligence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e83790_fig03.png"/></fig><p>Although the literature used various explainability strategies, several techniques that are well established in related domains have rarely been adapted for clinical audio-based systems. For instance, example-based and counterexample-based explainability methods are widely used in audio-based emotion recognition [<xref ref-type="bibr" rid="ref73">73</xref>] to generate sonified explanations. Similarly, in the text-to-audio domain, AudioGenX quantifies the importance of textual tokens corresponding to generated audio using factual and counterfactual techniques [<xref ref-type="bibr" rid="ref16">16</xref>]. Integrating these approaches into clinical voice and speech analysis can enhance explanation fidelity and stakeholder interpretability.</p><p>Moreover, many of the XAI techniques surveyed in this review were originally developed for image or tabular data, underscoring the need for domain-specific approaches tailored to the temporal-spectral nature of audio signals and the heterogeneous manifestations of speech and voice disorders. This is especially important given the perceptual nature of audio.
These methods should aim to map abstract representations (eg, deep transformer-based embeddings) to established clinical constructs, thereby bridging the gap between model outputs and actionable clinical insight. For example, CoughLIME (Cough Local Interpretable Model-Agnostic Explanations) [<xref ref-type="bibr" rid="ref122">122</xref>] extended LIME to generate sonified explanations for COVID-19 cough analysis. Future work should aim to develop sonified XAI that aligns with the perceptive ability of clinicians (eg, speech-language pathologists). Recent progress in large audio language models is garnering interest due to chain-of-thought reasoning and their capability to identify environmental sounds, speech characteristics, and respiratory and heart sounds [<xref ref-type="bibr" rid="ref123">123</xref>,<xref ref-type="bibr" rid="ref124">124</xref>]. These models can be used to extract salient acoustic features and present them in structured, report-style textual summaries through a stakeholder-friendly interface.</p><p>Our review also identified a lack of rigorous validation of XAI explanations. Future work should use both quantitative measures (eg, fidelity, sensitivity, perturbation-based testing, and cross-dataset explanation consistency) and qualitative, human-centered evaluation strategies (eg, expert annotation comparison and interrater agreement). For example, a recent work introduces a frequency band perturbation framework for quantitatively evaluating the faithfulness of various XAI techniques [<xref ref-type="bibr" rid="ref125">125</xref>].</p><p>Additionally, the misalignment of explanation formats with stakeholder needs highlights the importance of iterative, collaborative design processes in which AI developers engage clinicians, patients, and regulators throughout system development. 
Such collaboration can help ensure that explanations are understandable, clinically relevant, and operationally feasible, while supporting auditability, transparency, and accountability. For instance, the work by Pizzimenti et al [<xref ref-type="bibr" rid="ref126">126</xref>] describes a Delphi study aimed at unifying and standardizing vocal biomarker research, involving clinicians, statisticians, audio signal processing experts, AI researchers, and ethicists. Such efforts highlight the importance of structured, interdisciplinary validation frameworks for clinical audio research.</p><p>Finally, throughout this systematic review, we encountered substantial difficulty in quantifying the degree of explainability and practical utility of reported methods, particularly across different stakeholder groups. Consistent with prior findings [<xref ref-type="bibr" rid="ref44">44</xref>], only a small number of studies formally evaluated the effectiveness of XAI in clinical settings. Future work should therefore develop composite evaluation frameworks that integrate objective indicators (eg, performance improvement and error detection) with subjective measures (eg, perceived clarity, trust, and usability), enabling comparison of explainability methods in terms of both model alignment and real-world clinical impact. This can be achieved by addressing limitations identified under PROBAST+AI, such that future work prioritizes study designs with representative cohorts, sufficient sample sizes, and appropriate model evaluation of diagnostic and prognostic modeling. Such practices will ensure the development of methodologically sound models, such that clinical outcomes derived from explainability methods are of high quality and clinical value, thus increasing the trustworthiness of AI in clinical practice.</p></sec><sec id="s4-6"><title>Limitations</title><p>This review has several limitations that should be considered when interpreting the findings.
First, study selection and data extraction were conducted by a single reviewer. Although established PRISMA procedures and predefined inclusion criteria were followed, the absence of a second independent screening may increase the risk of selection bias or missed studies.</p><p>Second, conclusions regarding domain-specific explanation patterns are constrained by the methodological quality of the underlying prediction models. A substantial proportion of studies exhibited a high risk of bias, particularly in model development and evaluation. In addition, several studies relied on repeated use of the same benchmark datasets across dysarthria, PD, and AD domains, raising the risk of circular validation and inflating apparent consistency of explanation patterns. The domain-specific synthesis presented in this review is therefore intended to characterize current practice rather than establish definitive clinical explanatory signatures.</p><p>Finally, given the rapid growth of this field, studies published after the search period (February 2025) may provide additional insights and are not reflected in this review.</p></sec><sec id="s4-7"><title>Conclusions</title><p>In this systematic review, we presented current practices of explainability and interpretability for deep learning&#x2013;based voice and speech analysis in clinical care. Across 30 eligible studies, we identified a diverse set of explainability methods, which we organized into commonly adopted categories. Our findings indicate that, although explainability techniques are increasingly applied across a wide range of clinical speech and voice applications, their use is largely exploratory and rarely supported by rigorous validation. Explanations were predominantly assessed through qualitative interpretation, with limited evaluation of faithfulness, robustness, or consistency across datasets, and no explicit human-in-the-loop assessment involving clinical or regulatory stakeholders.
Additionally, the quality of underlying models limits the validity of the reported explanation patterns and, subsequently, the applicability of these models for real-world clinical applications. These findings highlight the need for domain-specific, clinically grounded explainability methods, standardized validation protocols, and stakeholder-aware explanation design to support the safe and effective integration of voice and speech AI into clinical practice.</p></sec></sec></body><back><ack><p>ME was responsible for conception, design, and main manuscript preparation. JT, YB, and JMT were responsible for supplemental materials and overall formal review and editing. All authors reviewed and approved the final version of this paper. OpenAI ChatGPT 5.2 (Feb 2026) was used for manuscript editing, correcting grammatical errors, and enhancing readability in the Abstract, Methods, Results, and figure and table captions following the first round of peer reviews.</p><p>Bridge2AI-Voice Consortium: Baycrest Centre: Amanda Chao; Linda Ma; Gayathiri Rajkumar Boston Children's Hospital: Kathy Jenkins; Stacy Jo; Elizabeth Silberholz Boston Children's Hospital: John Costello CENIDET: Enrique Diaz-Ocampo Dalhousie University &#x0026; Vector Institute: Xijie Zeng Dalhousie University; Vector Institute: Frank Rudzicz Florida Atlantic University: Elijah Moothedan Harvard University; Massachusetts Institute of Technology: Rahul Brito Hennick Bridgepoint Hospital: Omar Ghaffar Hospital for Sick Children: Jennifer Siu; Justin Levinsky; Laurie Russell; Joyce Samuel; Lala Su Massachusetts Institute of Technology: Isaac Bevers; Kaley Jenney; Jordan Wilke; Satrajit Ghosh Mount Sinai Hospital: Julie Tu; Madeleine Zanin Mount Sinai Hospital, Sinai Health: Selina Casalino Mount Sinai Hospital, Sinai Health, Toronto; Lunenfeld-Tanenbaum Research Institute, Sinai Health, Toronto: Radhika Mahajan NIH Clinical Center, U.S.
National Institutes of Health; Institute of Biomedical Engineering, University of Oxford: James Anibal Oregon Health &#x0026; Science University: David Dorr; Steven Bedrick; Abhijeet Dalal; William Hersh; LeAnn Michaels; Venkata Swarna Mukhi Talluri Sick Kids: Anna Goldenberg; Siyu Miao Simon Fraser University: Jean-Christophe B&#x00E9;lisle-Pipon; Dona Amraei; Alexander Bernier; Alden Blatter; L&#x00E9;o Cadillac; Amanda Doherty-Kirby; Renee English; Hortense Gallois; C. Gaelyn Garrett; Zoha Khawaja; Chloe Loewith; Marie-Fran&#x00E7;oise Malo; Pablo Montoya Varela; Michaela Pnacekova; Jaiden Potter; Claire Premi-Bortolotto; Luka Taylor; Gavin Victor; Claire Wilson Sinai Health: Lochana Jayachandran; Elisa Lapadula The Hastings Center: Vardit Ravitsky The Hospital for Sick Children: Evan Ng Trillium Health Partners and University of Toronto: Amer Ghavanini UT Health, Houston: Toufeeq Ahmed Syed University of Central Florida: Shaheen Awan University of Florida: Donald Bolser University of South Florida: Yael Bensoussan; Ruth Bahr; Stephanie Watts; Micah Boyer; Yassmeen Abdel-Aty; Kirollos Armosh; Ana Sophia Avila Martinez; Helena Beltran; Moroni Berrios; John Brown; Iris De Santiago; Mohamed Ebraheem; Ellie Eiseman; Mahmoud Elmahdy; Emily Evangelista; Karim Hanna; Jennifer Jain; Brenda Juan Guardela; Ayush Kalia; Megha Kalia; Cynthia Kostelnik; Alisa Krause; Genelle Leo; Vrishni Maharaj; Marian Mikael; Yosef Nafii; Tempestt Neal; Karlee Newberry; Christopher Nickel; Trevor Pharr; Parnaz Rafatjou; JM Rahman; Jillian Rossi; John Stark; Shrramana Ganesh Sudhakar; Jamie Toghranegar; Megan Urbano; Theresa Zesiewicz University of Toronto: Jordan Lerner-Ellis Vanderbilt University: Alexander Gelbard Vanderbilt University Medical Center: Maria Powell; Amy Brown; Kenneth Fletcher; Kenji Kobayashi; Amanda Peltier; Matthew Pontell; Sarah Rohde; Michael de Riesthal; Samantha Salvi Cruz; Kimberly Vinson Washington University in St. 
Louis: Andrea Krussel Washington University in St. Louis School of Medicine: Phillip Payne Weill Cornell Medicine: Alexandros Sigaras; Ana&#x00EF;s Rameau; Olivier Elemento; John Ramos; Jeffrey Tang; Robin Zhao; Pantelis Zisimopoulos</p></ack><notes><sec><title>Funding</title><p>This project is part of the Bridge2AI-Voice program funded by the NIH (National Institutes of Health) Common Fund #3OT2OD032720-01S2. YB is the principal investigator for this grant.</p></sec><sec><title>Data Availability</title><p>Data extraction spreadsheet and ratings of risk of bias signaling questions are available on request through the authors&#x2019; email.</p></sec></notes><fn-group><fn fn-type="conflict"><p>JMT and YB are PhD advisors for ME. YB, ME, JMT, and JT are members of the Bridge2AI-Voice Consortium.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AD</term><def><p>Alzheimer disease</p></def></def-item><def-item><term id="abb2">ADReSS</term><def><p> Alzheimer's Dementia Recognition Through Spontaneous Speech</p></def></def-item><def-item><term id="abb3">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb4">ASR</term><def><p>automatic speech recognition</p></def></def-item><def-item><term id="abb5">CNN</term><def><p>convolutional neural network</p></def></def-item><def-item><term id="abb6">CoughLIME</term><def><p>Cough Local Interpretable Model-Agnostic Explanations</p></def></def-item><def-item><term id="abb7">CT</term><def><p>computed tomography</p></def></def-item><def-item><term id="abb8">ECG</term><def><p>electrocardiogram</p></def></def-item><def-item><term id="abb9">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb10">Eigen-CAM</term><def><p>Eigen class activation maps</p></def></def-item><def-item><term id="abb11">Grad-CAM</term><def><p>gradient-weighted class activation mapping</p></def></def-item><def-item><term id="abb12">GRBAS</term><def><p>Grade, 
Roughness, Breathiness, Asthenia, Strain (voice scale)</p></def></def-item><def-item><term id="abb13">HuBERT</term><def><p>Hidden-Unit Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb14">LIME</term><def><p>local interpretable model-agnostic explanations</p></def></def-item><def-item><term id="abb15">LLD</term><def><p>low-level descriptor</p></def></def-item><def-item><term id="abb16">MFCC</term><def><p>mel-frequency cepstral coefficient</p></def></def-item><def-item><term id="abb17">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb18">MRI</term><def><p>magnetic resonance imaging</p></def></def-item><def-item><term id="abb19">PC-GITA</term><def><p>Parkinson Corpus &#x2013; Grupo de Investigaci&#x00F3;n en Telecomunicaciones Aplicadas</p></def></def-item><def-item><term id="abb20">PD</term><def><p>Parkinson disease</p></def></def-item><def-item><term id="abb21">PICOTS</term><def><p>Population; Index Model; Comparator; Outcome; Timing; Setting</p></def></def-item><def-item><term id="abb22">PRISMA</term><def><p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses</p></def></def-item><def-item><term id="abb23">PROBAST+AI</term><def><p>Prediction Model Risk of Bias Assessment Tool for Artificial Intelligence</p></def></def-item><def-item><term id="abb24">SHAP</term><def><p>Shapley Additive Explanations</p></def></def-item><def-item><term id="abb25">t-SNE</term><def><p>t-distributed stochastic neighbor embedding</p></def></def-item><def-item><term id="abb26">TBI</term><def><p>traumatic brain injury</p></def></def-item><def-item><term id="abb27">UASpeech</term><def><p>Universal Access Speech</p></def></def-item><def-item><term id="abb28">VOICED</term><def><p>VOice ICar fEDerico II</p></def></def-item><def-item><term id="abb29">XAI</term><def><p>explainable artificial intelligence</p></def></def-item><def-item><term id="abb30">xDMFCC</term><def><p>explainable deep learning
mel-frequency cepstral coefficients</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fagherazzi</surname><given-names>G</given-names> </name><name name-style="western"><surname>Fischer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ismael</surname><given-names>M</given-names> </name><name name-style="western"><surname>Despotovic</surname><given-names>V</given-names> </name></person-group><article-title>Voice for health: the use of vocal biomarkers from research to clinical practice</article-title><source>Digit Biomark</source><year>2021</year><volume>5</volume><issue>1</issue><fpage>78</fpage><lpage>88</lpage><pub-id pub-id-type="doi">10.1159/000515346</pub-id><pub-id pub-id-type="medline">34056518</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ramanarayanan</surname><given-names>V</given-names> </name><name name-style="western"><surname>Lammert</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Rowe</surname><given-names>HP</given-names> </name><name name-style="western"><surname>Quatieri</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Green</surname><given-names>JR</given-names> </name></person-group><article-title>Speech as a biomarker: opportunities, interpretability, and challenges</article-title><source>Perspect ASHA SIGs</source><year>2022</year><month>02</month><day>11</day><volume>7</volume><issue>1</issue><fpage>276</fpage><lpage>283</lpage><pub-id pub-id-type="doi">10.1044/2021_PERSP-21-00174</pub-id><pub-id pub-id-type="medline">41341425</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>X</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>He</surname><given-names>C</given-names> </name></person-group><article-title>Voice disorder classification using convolutional neural network based on deep transfer learning</article-title><source>Sci Rep</source><year>2023</year><month>05</month><day>4</day><volume>13</volume><issue>1</issue><fpage>7264</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-34461-9</pub-id><pub-id pub-id-type="medline">37142759</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shaikh</surname><given-names>AAS</given-names> </name><name name-style="western"><surname>Bhargavi</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Naik</surname><given-names>GR</given-names> </name></person-group><article-title>Unraveling the complexities of pathological voice through saliency analysis</article-title><source>Comput Biol Med</source><year>2023</year><month>11</month><volume>166</volume><fpage>107566</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.107566</pub-id><pub-id pub-id-type="medline">37857135</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hung</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>CT</given-names> </name><name 
name-style="western"><surname>Fang</surname><given-names>SH</given-names> </name></person-group><article-title>Using SincNet for learning pathological voice disorders</article-title><source>Sensors (Basel)</source><year>2022</year><month>09</month><day>2</day><volume>22</volume><issue>17</issue><fpage>6634</fpage><pub-id pub-id-type="doi">10.3390/s22176634</pub-id><pub-id pub-id-type="medline">36081092</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Eom</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Pak</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jeong</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Son</surname><given-names>HY</given-names> </name></person-group><article-title>Predictions for three-month postoperative vocal recovery after thyroid surgery from spectrograms with deep neural network</article-title><source>Sensors (Basel)</source><year>2022</year><month>08</month><day>24</day><volume>22</volume><issue>17</issue><fpage>6387</fpage><pub-id pub-id-type="doi">10.3390/s22176387</pub-id><pub-id pub-id-type="medline">36080847</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mathad</surname><given-names>VC</given-names> </name><name name-style="western"><surname>Scherer</surname><given-names>N</given-names> </name><name name-style="western"><surname>Chapman</surname><given-names>K</given-names> </name><name name-style="western"><surname>Liss</surname><given-names>JM</given-names> </name><name 
name-style="western"><surname>Berisha</surname><given-names>V</given-names> </name></person-group><article-title>A deep learning algorithm for objective assessment of hypernasality in children with cleft palate</article-title><source>IEEE Trans Biomed Eng</source><year>2021</year><month>10</month><volume>68</volume><issue>10</issue><fpage>2986</fpage><lpage>2996</lpage><pub-id pub-id-type="doi">10.1109/TBME.2021.3058424</pub-id><pub-id pub-id-type="medline">33566756</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>N</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>Q</given-names> </name></person-group><article-title>Improving Alzheimer&#x2019;s disease detection for speech based on feature purification network</article-title><source>Front Public Health</source><year>2021</year><volume>9</volume><fpage>835960</fpage><pub-id pub-id-type="doi">10.3389/fpubh.2021.835960</pub-id><pub-id pub-id-type="medline">35310782</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>LZ</given-names> </name><name name-style="western"><surname>Li</surname><given-names>H</given-names> </name></person-group><article-title>DEMENTIA: a hybrid attention-based multimodal and multi-task learning framework with expert knowledge for Alzheimer&#x2019;s disease assessment from speech</article-title><source>IEEE J Biomed Health 
Inform</source><year>2025</year><month>04</month><volume>29</volume><issue>4</issue><fpage>2957</fpage><lpage>2968</lpage><pub-id pub-id-type="doi">10.1109/JBHI.2024.3509620</pub-id><pub-id pub-id-type="medline">40030727</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jeong</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>EC</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>HJ</given-names> </name></person-group><article-title>Exploring spectrogram-based audio classification for Parkinson&#x2019;s disease: a study on speech classification and qualitative reliability verification</article-title><source>Sensors (Basel)</source><year>2024</year><month>07</month><day>17</day><volume>24</volume><issue>14</issue><fpage>4625</fpage><pub-id pub-id-type="doi">10.3390/s24144625</pub-id><pub-id pub-id-type="medline">39066023</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lahoti</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gurugubelli</surname><given-names>K</given-names> </name><name name-style="western"><surname>Arroyave</surname><given-names>JRO</given-names> </name><name name-style="western"><surname>Vuppala</surname><given-names>AK</given-names> </name></person-group><article-title>Shifted delta cepstral coefficients with RNN to improve the detection of Parkinson&#x2019;s disease from the speech</article-title><source>IC3-2022</source><year>2022</year><month>08</month><day>4</day><fpage>284</fpage><lpage>288</lpage><pub-id pub-id-type="doi">10.1145/3549206.3549258</pub-id></nlm-citation></ref><ref 
id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Laguarta</surname><given-names>J</given-names> </name><name name-style="western"><surname>Subirana</surname><given-names>B</given-names> </name></person-group><article-title>Longitudinal speech biomarkers for automated Alzheimer&#x2019;s detection</article-title><source>Front Comput Sci</source><year>2021</year><volume>3</volume><fpage>624694</fpage><pub-id pub-id-type="doi">10.3389/fcomp.2021.624694</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vasquez-Correa</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Arias-Vergara</surname><given-names>T</given-names> </name><name name-style="western"><surname>Orozco-Arroyave</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Eskofier</surname><given-names>B</given-names> </name><name name-style="western"><surname>Klucken</surname><given-names>J</given-names> </name><name name-style="western"><surname>Noth</surname><given-names>E</given-names> </name></person-group><article-title>Multimodal assessment of Parkinson&#x2019;s disease: a deep learning approach</article-title><source>IEEE J Biomed Health Inform</source><year>2019</year><month>07</month><volume>23</volume><issue>4</issue><fpage>1618</fpage><lpage>1630</lpage><pub-id pub-id-type="doi">10.1109/JBHI.2018.2866873</pub-id><pub-id pub-id-type="medline">30137018</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>YJ</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>YT</given-names> </name><name 
name-style="western"><surname>Liu</surname><given-names>CC</given-names> </name><etal/></person-group><article-title>Assessing schizophrenia patients through linguistic and acoustic features using deep learning techniques</article-title><source>IEEE Trans Neural Syst Rehabil Eng</source><year>2022</year><volume>30</volume><fpage>947</fpage><lpage>956</lpage><pub-id pub-id-type="doi">10.1109/TNSRE.2022.3163777</pub-id><pub-id pub-id-type="medline">35358049</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>W</given-names> </name><name name-style="western"><surname>Li</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>C</given-names> </name></person-group><article-title>Improving speech depression detection using transfer learning with wav2vec 2.0 in low-resource environments</article-title><source>Sci Rep</source><year>2024</year><month>04</month><day>25</day><volume>14</volume><issue>1</issue><fpage>9543</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-60278-1</pub-id><pub-id pub-id-type="medline">38664511</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>N</given-names> </name><name name-style="western"><surname>Li</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Exploring explainable AI features in the vocal biomarkers of lung disease</article-title><source>Comput Biol 
Med</source><year>2024</year><month>09</month><volume>179</volume><fpage>108844</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.108844</pub-id><pub-id pub-id-type="medline">38981214</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bauser</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kraus</surname><given-names>F</given-names> </name><name name-style="western"><surname>Koehler</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Voice assessment and vocal biomarkers in heart failure: a systematic review</article-title><source>Circ Heart Fail</source><year>2025</year><month>08</month><volume>18</volume><issue>8</issue><fpage>e012303</fpage><pub-id pub-id-type="doi">10.1161/CIRCHEARTFAILURE.124.012303</pub-id><pub-id pub-id-type="medline">40270235</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Geng</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Recent progress in the CUHK dysarthric speech recognition system</article-title><source>IEEE/ACM Trans Audio Speech Lang Process</source><year>2021</year><volume>29</volume><fpage>2267</fpage><lpage>2281</lpage><pub-id pub-id-type="doi">10.1109/TASLP.2021.3091805</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Geng</surname><given-names>M</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>X</given-names> </name><name 
name-style="western"><surname>Ye</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Speaker adaptation using spectro-temporal deep features for dysarthric and elderly speech recognition</article-title><source>IEEE/ACM Trans Audio Speech Lang Process</source><year>2022</year><volume>30</volume><fpage>2597</fpage><lpage>2611</lpage><pub-id pub-id-type="doi">10.1109/TASLP.2022.3195113</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yue</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Loweimi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Christensen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Barker</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cvetkovic</surname><given-names>Z</given-names> </name></person-group><article-title>Acoustic modelling from raw source and filter components for dysarthric speech recognition</article-title><source>IEEE/ACM Trans Audio Speech Lang Process</source><year>2022</year><volume>30</volume><fpage>2968</fpage><lpage>2980</lpage><pub-id pub-id-type="doi">10.1109/TASLP.2022.3205766</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Di Cesare</surname><given-names>MG</given-names> </name><name name-style="western"><surname>Perpetuini</surname><given-names>D</given-names> </name><name name-style="western"><surname>Cardone</surname><given-names>D</given-names> </name><name name-style="western"><surname>Merla</surname><given-names>A</given-names> </name></person-group><article-title>Assessment of voice disorders using machine learning and vocal analysis of voice samples recorded through 
smartphones</article-title><source>BioMedInformatics</source><year>2024</year><volume>4</volume><issue>1</issue><fpage>549</fpage><lpage>565</lpage><pub-id pub-id-type="doi">10.3390/biomedinformatics4010031</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rajpurkar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>O</given-names> </name><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>AI in health and medicine</article-title><source>Nat Med</source><year>2022</year><month>01</month><volume>28</volume><issue>1</issue><fpage>31</fpage><lpage>38</lpage><pub-id pub-id-type="doi">10.1038/s41591-021-01614-0</pub-id><pub-id pub-id-type="medline">35058619</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bensoussan</surname><given-names>YS</given-names> </name><name name-style="western"><surname>Rameau</surname><given-names>A</given-names> </name><name name-style="western"><surname>Elemento</surname><given-names>O</given-names> </name><etal/></person-group><article-title>Bridge2AI-Voice: an ethically-sourced, diverse voice dataset linked to health information</article-title><source>PhysioNet</source><year>2020</year><volume>101</volume><issue>23</issue><fpage>e215</fpage><lpage>e220</lpage><pub-id pub-id-type="doi">10.13026/gzjs-0535</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>MacWhinney</surname><given-names>B</given-names> </name><name 
name-style="western"><surname>Fromm</surname><given-names>D</given-names> </name><name name-style="western"><surname>Forbes</surname><given-names>M</given-names> </name><name name-style="western"><surname>Holland</surname><given-names>A</given-names> </name></person-group><article-title>AphasiaBank: methods for studying discourse</article-title><source>Aphasiology</source><year>2011</year><volume>25</volume><issue>11</issue><fpage>1286</fpage><lpage>1307</lpage><pub-id pub-id-type="doi">10.1080/02687038.2011.589893</pub-id><pub-id pub-id-type="medline">22923879</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><article-title>What if speech could unlock early detection of Alzheimer&#x2019;s disease?</article-title><source>The Alzheimer&#x2019;s Drug Discovery Foundation</source><access-date>2026-05-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.alzdiscovery.org/research-and-grants/speechdx">https://www.alzdiscovery.org/research-and-grants/speechdx</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dur&#x00E1;n</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Jongsma</surname><given-names>KR</given-names> </name></person-group><article-title>Who is afraid of black box algorithms? 
On the epistemological and ethical basis of trust in medical AI</article-title><source>J Med Ethics</source><year>2021</year><month>03</month><day>18</day><volume>47</volume><issue>5</issue><fpage>329</fpage><lpage>335</lpage><pub-id pub-id-type="doi">10.1136/medethics-2020-106820</pub-id><pub-id pub-id-type="medline">33737318</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Poon</surname><given-names>AIF</given-names> </name><name name-style="western"><surname>Sung</surname><given-names>JJY</given-names> </name></person-group><article-title>Opening the black box of AI-medicine</article-title><source>J Gastroenterol Hepatol</source><year>2021</year><month>03</month><volume>36</volume><issue>3</issue><fpage>581</fpage><lpage>584</lpage><pub-id pub-id-type="doi">10.1111/jgh.15384</pub-id><pub-id pub-id-type="medline">33709609</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>London</surname><given-names>AJ</given-names> </name></person-group><article-title>Artificial intelligence and black-box medical decisions: accuracy versus explainability</article-title><source>Hastings Cent Rep</source><year>2019</year><month>01</month><volume>49</volume><issue>1</issue><fpage>15</fpage><lpage>21</lpage><pub-id pub-id-type="doi">10.1002/hast.973</pub-id><pub-id pub-id-type="medline">30790315</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Raposo</surname><given-names>VL</given-names> </name></person-group><article-title>The fifty shades of black: about black box AI and explainability in healthcare</article-title><source>Med Law 
Rev</source><year>2025</year><month>01</month><day>4</day><volume>33</volume><issue>1</issue><fpage>fwaf005</fpage><pub-id pub-id-type="doi">10.1093/medlaw/fwaf005</pub-id><pub-id pub-id-type="medline">39916325</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aravazhi</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Gunasekaran</surname><given-names>P</given-names> </name><name name-style="western"><surname>Benjamin</surname><given-names>NZY</given-names> </name><etal/></person-group><article-title>The integration of artificial intelligence into clinical medicine: trends, challenges, and future directions</article-title><source>Dis Mon</source><year>2025</year><month>06</month><volume>71</volume><issue>6</issue><fpage>101882</fpage><pub-id pub-id-type="doi">10.1016/j.disamonth.2025.101882</pub-id><pub-id pub-id-type="medline">40140300</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lauritsen</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Kristensen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Olsen</surname><given-names>MV</given-names> </name><etal/></person-group><article-title>Explainable artificial intelligence model to predict acute critical illness from electronic health records</article-title><source>Nat Commun</source><year>2020</year><month>07</month><day>31</day><volume>11</volume><issue>1</issue><fpage>3852</fpage><pub-id pub-id-type="doi">10.1038/s41467-020-17431-x</pub-id><pub-id pub-id-type="medline">32737308</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Rudin</surname><given-names>C</given-names> </name></person-group><article-title>Stop explaining black box machine learning models for high stakes decisions and use interpretable models instead</article-title><source>Nat Mach Intell</source><year>2019</year><month>05</month><volume>1</volume><issue>5</issue><fpage>206</fpage><lpage>215</lpage><pub-id pub-id-type="doi">10.1038/s42256-019-0048-x</pub-id><pub-id pub-id-type="medline">35603010</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Teng</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Song</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Han</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Y</given-names> </name></person-group><article-title>A survey on the interpretability of deep learning in medical diagnosis</article-title><source>Multimed Syst</source><year>2022</year><volume>28</volume><issue>6</issue><fpage>2335</fpage><lpage>2355</lpage><pub-id pub-id-type="doi">10.1007/s00530-022-00960-4</pub-id><pub-id pub-id-type="medline">35789785</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Marey</surname><given-names>A</given-names> </name><name name-style="western"><surname>Arjmand</surname><given-names>P</given-names> </name><name name-style="western"><surname>Alerab</surname><given-names>ADS</given-names> </name><etal/></person-group><article-title>Explainability, transparency and black box challenges of AI in radiology: impact on patient care in cardiovascular radiology</article-title><source>Egypt J Radiol Nucl 
Med</source><year>2024</year><volume>55</volume><issue>1</issue><fpage>183</fpage><pub-id pub-id-type="doi">10.1186/s43055-024-01356-2</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wysocki</surname><given-names>O</given-names> </name><name name-style="western"><surname>Davies</surname><given-names>JK</given-names> </name><name name-style="western"><surname>Vigo</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Assessing the communication gap between AI models and healthcare professionals: explainability, utility and trust in AI-driven clinical decision-making</article-title><source>Artif Intell</source><year>2023</year><month>03</month><volume>316</volume><fpage>103839</fpage><pub-id pub-id-type="doi">10.1016/j.artint.2022.103839</pub-id><pub-id pub-id-type="medline">41550460</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bienefeld</surname><given-names>N</given-names> </name><name name-style="western"><surname>Boss</surname><given-names>JM</given-names> </name><name name-style="western"><surname>L&#x00FC;thy</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Solving the explainable AI conundrum by bridging clinicians&#x2019; needs and developers&#x2019; goals</article-title><source>NPJ Digit Med</source><year>2023</year><month>05</month><day>22</day><volume>6</volume><issue>1</issue><fpage>94</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00837-4</pub-id><pub-id pub-id-type="medline">37217779</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Song</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>Y</given-names> </name></person-group><article-title>Do stakeholder needs differ? - Designing stakeholder-tailored explainable artificial intelligence (XAI) interfaces</article-title><source>Int J Hum Comput Stud</source><year>2024</year><month>01</month><volume>181</volume><fpage>103160</fpage><pub-id pub-id-type="doi">10.1016/j.ijhcs.2023.103160</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sengupta</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lakshminarayanan</surname><given-names>V</given-names> </name></person-group><article-title>Explainable deep learning models in medical image analysis</article-title><source>J Imaging</source><year>2020</year><month>06</month><day>20</day><volume>6</volume><issue>6</issue><fpage>52</fpage><pub-id pub-id-type="doi">10.3390/jimaging6060052</pub-id><pub-id pub-id-type="medline">34460598</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Linardatos</surname><given-names>P</given-names> </name><name name-style="western"><surname>Papastefanopoulos</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kotsiantis</surname><given-names>S</given-names> </name></person-group><article-title>Explainable AI: a review of machine learning interpretability methods</article-title><source>Entropy 
(Basel)</source><year>2020</year><month>12</month><day>25</day><volume>23</volume><issue>1</issue><fpage>18</fpage><pub-id pub-id-type="doi">10.3390/e23010018</pub-id><pub-id pub-id-type="medline">33375658</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Gilpin</surname><given-names>LH</given-names> </name><name name-style="western"><surname>Bau</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>BZ</given-names> </name><name name-style="western"><surname>Bajwa</surname><given-names>A</given-names> </name><name name-style="western"><surname>Specter</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kagal</surname><given-names>L</given-names> </name></person-group><article-title>Explaining explanations: an overview of interpretability of machine learning</article-title><source>2018 IEEE 5th International Conference on Data Science and Advanced Analytics (DSAA)</source><year>2018</year><publisher-name>IEEE</publisher-name><fpage>80</fpage><lpage>89</lpage><pub-id pub-id-type="doi">10.1109/DSAA.2018.00018</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Das</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rad</surname><given-names>P</given-names> </name></person-group><article-title>Opportunities and challenges in explainable artificial intelligence (XAI): a survey</article-title><source>arXiv</source><comment>Preprint posted online on Jun 23, 2020</comment><pub-id pub-id-type="doi">10.48550/arXiv.2006.11371</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name 
name-style="western"><surname>Phillips</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Hahn</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Fontana</surname><given-names>PC</given-names> </name><name name-style="western"><surname>Yates</surname><given-names>AN</given-names> </name><name name-style="western"><surname>Greene</surname><given-names>K</given-names> </name><name name-style="western"><surname>Broniatowski</surname><given-names>DA</given-names> </name><etal/></person-group><article-title>Four principles of explainable artificial intelligence</article-title><year>2021</year><month>09</month><publisher-name>National Institute of Standards and Technology</publisher-name><pub-id pub-id-type="doi">10.6028/NIST.IR.8312</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bhatt</surname><given-names>U</given-names> </name><name name-style="western"><surname>Xiang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Explainable machine learning in deployment</article-title><source>FAT* &#x2019;20</source><year>2020</year><month>01</month><day>27</day><fpage>648</fpage><lpage>657</lpage><pub-id pub-id-type="doi">10.1145/3351095.3375624</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jung</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>H</given-names> </name><name name-style="western"><surname>Jung</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>H</given-names> </name></person-group><article-title>Essential properties 
and explanation effectiveness of explainable artificial intelligence in healthcare: a systematic review</article-title><source>Heliyon</source><year>2023</year><month>05</month><volume>9</volume><issue>5</issue><fpage>e16110</fpage><pub-id pub-id-type="doi">10.1016/j.heliyon.2023.e16110</pub-id><pub-id pub-id-type="medline">37234618</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nannini</surname><given-names>L</given-names> </name><name name-style="western"><surname>Balayn</surname><given-names>A</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>AL</given-names> </name></person-group><article-title>Explainability in AI policies: a critical review of communications, reports, regulations, and standards in the EU, US, and UK</article-title><source>FAccT &#x2019;23</source><year>2023</year><month>06</month><day>12</day><fpage>1198</fpage><lpage>1212</lpage><pub-id pub-id-type="doi">10.1145/3593013.3594074</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="web"><article-title>UK GDPR guidance and resources</article-title><source>Information Commissioner&#x2019;s Office</source><access-date>2026-05-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ico.org.uk/for-organisations/uk-gdpr-guidance-and-resources/">https://ico.org.uk/for-organisations/uk-gdpr-guidance-and-resources/</ext-link></comment></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Rojas</surname><given-names>F</given-names> </name><name name-style="western"><surname>Madanian</surname><given-names>S</given-names> </name><name name-style="western"><surname>Templeton</surname><given-names>JM</given-names> </name><name 
name-style="western"><surname>Poellabauer</surname><given-names>C</given-names> </name><name name-style="western"><surname>Schneider</surname><given-names>SL</given-names> </name></person-group><article-title>Exploring deep learning and grad-CAM for speech-based detection of mild traumatic brain injury</article-title><source>2024 IEEE International Conference on Big Data (BigData)</source><year>2024</year><publisher-name>IEEE</publisher-name><fpage>6108</fpage><lpage>6116</lpage><pub-id pub-id-type="doi">10.1109/BigData62323.2024.10825360</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>S</given-names> </name><name name-style="western"><surname>He</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Sch-net: a deep learning architecture for automatic detection of schizophrenia</article-title><source>Biomed Eng Online</source><year>2021</year><month>08</month><day>3</day><volume>20</volume><issue>1</issue><fpage>75</fpage><pub-id pub-id-type="doi">10.1186/s12938-021-00915-2</pub-id><pub-id pub-id-type="medline">34344372</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name></person-group><article-title>Individual-independent and cross-language detection of speech disfluencies in stuttering based on multi-adversarial tasks and self-training</article-title><source>Biomed Signal Process Control</source><year>2025</year><month>02</month><volume>100</volume><fpage>107051</fpage><pub-id 
pub-id-type="doi">10.1016/j.bspc.2024.107051</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gupta</surname><given-names>S</given-names> </name><name name-style="western"><surname>Patil</surname><given-names>AT</given-names> </name><name name-style="western"><surname>Purohit</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Residual neural network precisely quantifies dysarthria severity-level based on short-duration speech segments</article-title><source>Neural Netw</source><year>2021</year><month>07</month><volume>139</volume><fpage>105</fpage><lpage>117</lpage><pub-id pub-id-type="doi">10.1016/j.neunet.2021.02.008</pub-id><pub-id pub-id-type="medline">33684609</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schultebraucks</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yadav</surname><given-names>V</given-names> </name><name name-style="western"><surname>Shalev</surname><given-names>AY</given-names> </name><name name-style="western"><surname>Bonanno</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Galatzer-Levy</surname><given-names>IR</given-names> </name></person-group><article-title>Deep learning-based classification of posttraumatic stress disorder and depression following trauma utilizing visual and auditory markers of arousal and mood</article-title><source>Psychol Med</source><year>2022</year><month>04</month><volume>52</volume><issue>5</issue><fpage>957</fpage><lpage>967</lpage><pub-id pub-id-type="doi">10.1017/S0033291720002718</pub-id><pub-id pub-id-type="medline">32744201</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Chander</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nepal</surname><given-names>S</given-names> </name><name name-style="western"><surname>Buck</surname><given-names>B</given-names> </name><name name-style="western"><surname>Pakhomov</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The power of speech in the wild: discriminative power of daily voice diaries in understanding auditory verbal hallucinations using deep learning</article-title><source>Proc ACM Interact Mob Wearable Ubiquitous Technol</source><year>2023</year><volume>7</volume><issue>3</issue><fpage>133</fpage><pub-id pub-id-type="doi">10.1145/3610890</pub-id><pub-id pub-id-type="medline">38737573</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ditthapron</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lammert</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Agu</surname><given-names>EO</given-names> </name></person-group><article-title>Continuous TBI monitoring from spontaneous speech using parametrized sinc filters and a cascading GRU</article-title><source>IEEE J Biomed Health Inform</source><year>2022</year><month>07</month><volume>26</volume><issue>7</issue><fpage>3517</fpage><lpage>3528</lpage><pub-id pub-id-type="doi">10.1109/JBHI.2022.3158840</pub-id><pub-id pub-id-type="medline">35290191</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Guti&#x00E9;rrez-Seraf&#x00ED;n</surname><given-names>B</given-names> </name><name name-style="western"><surname>Andreu-Perez</surname><given-names>J</given-names> </name><name name-style="western"><surname>P&#x00E9;rez-Espinosa</surname><given-names>H</given-names> </name><name name-style="western"><surname>Paulmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>W</given-names> </name></person-group><article-title>Toward assessment of human voice biomarkers of brain lesions through explainable deep learning</article-title><source>Biomed Signal Process Control</source><year>2024</year><month>01</month><volume>87</volume><fpage>105457</fpage><pub-id pub-id-type="doi">10.1016/j.bspc.2023.105457</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>L</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xiong</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name></person-group><article-title>WNSA-net: an axial-attention-based network for schizophrenia detection using wideband and narrowband spectrograms</article-title><source>IEEE/ACM Trans Audio Speech Lang Process</source><year>2023</year><volume>31</volume><fpage>721</fpage><lpage>733</lpage><pub-id pub-id-type="doi">10.1109/TASLP.2022.3209941</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Joshy</surname><given-names>AA</given-names> </name><name 
name-style="western"><surname>Rajan</surname><given-names>R</given-names> </name></person-group><article-title>Automated dysarthria severity classification: a study on acoustic features and deep learning techniques</article-title><source>IEEE Trans Neural Syst Rehabil Eng</source><year>2022</year><volume>30</volume><fpage>1147</fpage><lpage>1157</lpage><pub-id pub-id-type="doi">10.1109/TNSRE.2022.3169814</pub-id><pub-id pub-id-type="medline">35452390</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lau</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Huntly</surname><given-names>M</given-names> </name><name name-style="western"><surname>Morgan</surname><given-names>N</given-names> </name><name name-style="western"><surname>Iyenoma</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zeng</surname><given-names>B</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Bashford</surname><given-names>T</given-names> </name></person-group><article-title>Interpreting pretrained speech models for automatic speech assessment of voice disorders</article-title><source>Artif Intell Healthcare</source><year>2024</year><volume>14975</volume><fpage>59</fpage><lpage>72</lpage><pub-id pub-id-type="doi">10.1007/978-3-031-67278-1_5</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abderrazek</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fredouille</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ghio</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lalain</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Meunier</surname><given-names>C</given-names> </name><name name-style="western"><surname>Woisard</surname><given-names>V</given-names> </name></person-group><article-title>Interpreting deep representations of phonetic features via neuro-based concept detector: application to speech disorders due to head and neck cancer</article-title><source>IEEE/ACM Trans Audio Speech Lang Process</source><year>2023</year><volume>31</volume><fpage>200</fpage><lpage>214</lpage><pub-id pub-id-type="doi">10.1109/TASLP.2022.3221039</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>HB</given-names> </name><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name><name name-style="western"><surname>Park</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>YO</given-names> </name></person-group><article-title>Classification of laryngeal diseases including laryngeal cancer, benign mucosal disease, and vocal cord paralysis by artificial intelligence using voice analysis</article-title><source>Sci Rep</source><year>2024</year><month>04</month><day>23</day><volume>14</volume><issue>1</issue><fpage>9297</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-58817-x</pub-id><pub-id pub-id-type="medline">38654036</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>GW</given-names> </name><name name-style="western"><surname>Bong</surname><given-names>G</given-names> </name><name name-style="western"><surname>Yoo</surname><given-names>HJ</given-names> </name><name 
name-style="western"><surname>Kim</surname><given-names>HK</given-names> </name></person-group><article-title>Deep-learning-based detection of infants with autism spectrum disorder using auto-encoder feature representation</article-title><source>Sensors (Basel)</source><year>2020</year><month>11</month><day>26</day><volume>20</volume><issue>23</issue><fpage>6762</fpage><pub-id pub-id-type="doi">10.3390/s20236762</pub-id><pub-id pub-id-type="medline">33256061</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tjoa</surname><given-names>E</given-names> </name><name name-style="western"><surname>Guan</surname><given-names>C</given-names> </name></person-group><article-title>A survey on explainable artificial intelligence (XAI): toward medical XAI</article-title><source>IEEE Trans Neural Netw Learn Syst</source><year>2021</year><month>11</month><volume>32</volume><issue>11</issue><fpage>4793</fpage><lpage>4813</lpage><pub-id pub-id-type="doi">10.1109/TNNLS.2020.3027314</pub-id><pub-id pub-id-type="medline">33079674</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chaddad</surname><given-names>A</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bouridane</surname><given-names>A</given-names> </name></person-group><article-title>Survey of explainable AI techniques in healthcare</article-title><source>Sensors (Basel)</source><year>2023</year><month>01</month><day>5</day><volume>23</volume><issue>2</issue><fpage>634</fpage><pub-id pub-id-type="doi">10.3390/s23020634</pub-id><pub-id pub-id-type="medline">36679430</pub-id></nlm-citation></ref><ref 
id="ref63"><label>63</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sadeghi</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Alizadehsani</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cifci</surname><given-names>MA</given-names> </name><etal/></person-group><article-title>A review of explainable artificial intelligence in healthcare</article-title><source>Comput Electr Eng</source><year>2024</year><month>08</month><volume>118</volume><fpage>109370</fpage><pub-id pub-id-type="doi">10.1016/j.compeleceng.2024.109370</pub-id></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mienye</surname><given-names>ID</given-names> </name><name name-style="western"><surname>Obaido</surname><given-names>G</given-names> </name><name name-style="western"><surname>Jere</surname><given-names>N</given-names> </name><etal/></person-group><article-title>A survey of explainable artificial intelligence in healthcare: concepts, applications, and challenges</article-title><source>Inf Med Unlocked</source><year>2024</year><volume>51</volume><fpage>101587</fpage><pub-id pub-id-type="doi">10.1016/j.imu.2024.101587</pub-id></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sheu</surname><given-names>RK</given-names> </name><name name-style="western"><surname>Pardeshi</surname><given-names>MS</given-names> </name></person-group><article-title>A survey on medical explainable AI (XAI): recent progress, explainability approach, human interaction and scoring system</article-title><source>Sensors (Basel)</source><year>2022</year><month>10</month><day>21</day><volume>22</volume><issue>20</issue><fpage>8068</fpage><pub-id 
pub-id-type="doi">10.3390/s22208068</pub-id><pub-id pub-id-type="medline">36298417</pub-id></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van der Velden</surname><given-names>BHM</given-names> </name><name name-style="western"><surname>Kuijf</surname><given-names>HJ</given-names> </name><name name-style="western"><surname>Gilhuijs</surname><given-names>KGA</given-names> </name><name name-style="western"><surname>Viergever</surname><given-names>MA</given-names> </name></person-group><article-title>Explainable artificial intelligence (XAI) in deep learning-based medical image analysis</article-title><source>Med Image Anal</source><year>2022</year><month>07</month><volume>79</volume><fpage>102470</fpage><pub-id pub-id-type="doi">10.1016/j.media.2022.102470</pub-id><pub-id pub-id-type="medline">35576821</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Muhammad</surname><given-names>D</given-names> </name><name name-style="western"><surname>Bendechache</surname><given-names>M</given-names> </name></person-group><article-title>Unveiling the black box: a systematic review of explainable artificial intelligence in medical image analysis</article-title><source>Comput Struct Biotechnol J</source><year>2024</year><month>12</month><volume>24</volume><fpage>542</fpage><lpage>560</lpage><pub-id pub-id-type="doi">10.1016/j.csbj.2024.08.005</pub-id><pub-id pub-id-type="medline">39252818</pub-id></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Caterson</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lewin</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Williamson</surname><given-names>E</given-names> </name></person-group><article-title>The application of explainable artificial intelligence (XAI) in electronic health record research: a scoping review</article-title><source>Digit Health</source><year>2024</year><volume>10</volume><fpage>20552076241272657</fpage><pub-id pub-id-type="doi">10.1177/20552076241272657</pub-id><pub-id pub-id-type="medline">39493635</pub-id></nlm-citation></ref><ref id="ref69"><label>69</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Salih</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Galazzo</surname><given-names>IB</given-names> </name><name name-style="western"><surname>Gkontra</surname><given-names>P</given-names> </name><etal/></person-group><article-title>A review of evaluation approaches for explainable AI with applications in cardiology</article-title><source>Artif Intell Rev</source><year>2024</year><volume>57</volume><issue>9</issue><fpage>240</fpage><pub-id pub-id-type="doi">10.1007/s10462-024-10852-w</pub-id><pub-id pub-id-type="medline">39132011</pub-id></nlm-citation></ref><ref id="ref70"><label>70</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Joyce</surname><given-names>DW</given-names> </name><name name-style="western"><surname>Kormilitzin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Cipriani</surname><given-names>A</given-names> </name></person-group><article-title>Explainable artificial intelligence for mental health through transparency and interpretability for understandability</article-title><source>NPJ Digit Med</source><year>2023</year><month>01</month><day>18</day><volume>6</volume><issue>1</issue><fpage>6</fpage><pub-id 
pub-id-type="doi">10.1038/s41746-023-00751-9</pub-id><pub-id pub-id-type="medline">36653524</pub-id></nlm-citation></ref><ref id="ref71"><label>71</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kindermans</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Hooker</surname><given-names>S</given-names> </name><name name-style="western"><surname>Adebayo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Alber</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sch&#x00FC;tt</surname><given-names>KT</given-names> </name><name name-style="western"><surname>D&#x00E4;hne</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The (un)reliability of saliency methods</article-title><source>Explainable AI: Interpreting, Explaining and Visualizing Deep Learning</source><year>2022</year><publisher-name>Springer-Verlag</publisher-name><fpage>267</fpage><lpage>280</lpage><pub-id pub-id-type="doi">10.1007/978-3-030-28954-6_14</pub-id></nlm-citation></ref><ref id="ref72"><label>72</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adebayo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gilmer</surname><given-names>J</given-names> </name><name name-style="western"><surname>Muelly</surname><given-names>M</given-names> </name><name name-style="western"><surname>Goodfellow</surname><given-names>I</given-names> </name><name name-style="western"><surname>Hardt</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>B</given-names> </name></person-group><article-title>Sanity checks for saliency maps</article-title><source>Proc 32nd Int Conf Neural Inf Proc 
Syst</source><year>2018</year><access-date>2026-05-20</access-date><fpage>9525</fpage><lpage>9536</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2018/file/294a8ed24b1ad22ec2e7efea049b8737-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2018/file/294a8ed24b1ad22ec2e7efea049b8737-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref73"><label>73</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>BY</given-names> </name></person-group><article-title>Towards relatable explainable AI with the perceptual process</article-title><source>CHI &#x2019;22</source><year>2022</year><month>04</month><day>29</day><fpage>181</fpage><pub-id pub-id-type="doi">10.1145/3491102.3501826</pub-id></nlm-citation></ref><ref id="ref74"><label>74</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Petti</surname><given-names>U</given-names> </name><name name-style="western"><surname>Nyrup</surname><given-names>R</given-names> </name><name name-style="western"><surname>Skopek</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Korhonen</surname><given-names>A</given-names> </name></person-group><article-title>Ethical considerations in the early detection of Alzheimer&#x2019;s disease using speech and AI</article-title><source>FAccT &#x2019;23</source><year>2023</year><month>06</month><day>12</day><fpage>1062</fpage><lpage>1075</lpage><pub-id pub-id-type="doi">10.1145/3593013.3594063</pub-id></nlm-citation></ref><ref id="ref75"><label>75</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>P</given-names> </name><name 
name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hamdulla</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>D</given-names> </name></person-group><article-title>Reliable visualization for deep speaker recognition</article-title><source>arXiv</source><comment>Preprint posted online on Apr 12, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2204.03852</pub-id></nlm-citation></ref><ref id="ref76"><label>76</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ouzzani</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hammady</surname><given-names>H</given-names> </name><name name-style="western"><surname>Fedorowicz</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Elmagarmid</surname><given-names>A</given-names> </name></person-group><article-title>Rayyan&#x2014;a web and mobile app for systematic reviews</article-title><source>Syst Rev</source><year>2016</year><month>12</month><day>5</day><volume>5</volume><issue>1</issue><fpage>210</fpage><pub-id pub-id-type="doi">10.1186/s13643-016-0384-4</pub-id><pub-id pub-id-type="medline">27919275</pub-id></nlm-citation></ref><ref id="ref77"><label>77</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Speith</surname><given-names>T</given-names> </name></person-group><article-title>A review of taxonomies of explainable artificial intelligence (XAI) methods</article-title><source>FAccT &#x2019;22</source><year>2022</year><month>06</month><day>21</day><fpage>2239</fpage><lpage>2250</lpage><pub-id pub-id-type="doi">10.1145/3531146.3534639</pub-id></nlm-citation></ref><ref id="ref78"><label>78</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Samek</surname><given-names>W</given-names> </name><name name-style="western"><surname>Montavon</surname><given-names>G</given-names> </name><name name-style="western"><surname>Lapuschkin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Anders</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>M&#x00FC;ller</surname><given-names>KR</given-names> </name></person-group><article-title>Explaining deep neural networks and beyond: a review of methods and applications</article-title><source>Proc IEEE</source><year>2021</year><volume>109</volume><issue>3</issue><fpage>247</fpage><lpage>278</lpage><pub-id pub-id-type="doi">10.1109/JPROC.2021.3060483</pub-id></nlm-citation></ref><ref id="ref79"><label>79</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guidotti</surname><given-names>R</given-names> </name><name name-style="western"><surname>Monreale</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ruggieri</surname><given-names>S</given-names> </name><name name-style="western"><surname>Turini</surname><given-names>F</given-names> </name><name name-style="western"><surname>Giannotti</surname><given-names>F</given-names> </name><name name-style="western"><surname>Pedreschi</surname><given-names>D</given-names> </name></person-group><article-title>A survey of methods for explaining black box models</article-title><source>ACM Comput Surv</source><year>2019</year><month>09</month><day>30</day><volume>51</volume><issue>5</issue><fpage>1</fpage><lpage>42</lpage><pub-id pub-id-type="doi">10.1145/3236009</pub-id></nlm-citation></ref><ref id="ref80"><label>80</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moons</surname><given-names>KGM</given-names> </name><name 
name-style="western"><surname>Damen</surname><given-names>JAA</given-names> </name><name name-style="western"><surname>Kaul</surname><given-names>T</given-names> </name><etal/></person-group><article-title>PROBAST+AI: an updated quality, risk of bias, and applicability assessment tool for prediction models using regression or artificial intelligence methods</article-title><source>BMJ</source><year>2025</year><month>03</month><day>24</day><volume>388</volume><fpage>e082505</fpage><pub-id pub-id-type="doi">10.1136/bmj-2024-082505</pub-id><pub-id pub-id-type="medline">40127903</pub-id></nlm-citation></ref><ref id="ref81"><label>81</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wolff</surname><given-names>RF</given-names> </name><name name-style="western"><surname>Moons</surname><given-names>KGM</given-names> </name><name name-style="western"><surname>Riley</surname><given-names>RD</given-names> </name><etal/></person-group><article-title>PROBAST: a tool to assess the risk of bias and applicability of prediction model studies</article-title><source>Ann Intern Med</source><year>2019</year><month>01</month><day>1</day><volume>170</volume><issue>1</issue><fpage>51</fpage><lpage>58</lpage><pub-id pub-id-type="doi">10.7326/M18-1376</pub-id><pub-id pub-id-type="medline">30596875</pub-id></nlm-citation></ref><ref id="ref82"><label>82</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Herath</surname><given-names>H</given-names> </name><name name-style="western"><surname>Weraniyagoda</surname><given-names>W</given-names> </name><name name-style="western"><surname>Rajapaksha</surname><given-names>RTM</given-names> </name><name name-style="western"><surname>Wijesekara</surname><given-names>P</given-names> </name><name name-style="western"><surname>Sudheera</surname><given-names>KLK</given-names> </name><name 
name-style="western"><surname>Chong</surname><given-names>PHJ</given-names> </name></person-group><article-title>Automatic assessment of aphasic speech sensed by audio sensors for classification into aphasia severity levels to recommend speech therapies</article-title><source>Sensors (Basel)</source><year>2022</year><month>09</month><day>14</day><volume>22</volume><issue>18</issue><fpage>6966</fpage><pub-id pub-id-type="doi">10.3390/s22186966</pub-id><pub-id pub-id-type="medline">36146316</pub-id></nlm-citation></ref><ref id="ref83"><label>83</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Martinc</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pollak</surname><given-names>S</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Martinc</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pollak</surname><given-names>S</given-names> </name></person-group><article-title>Tackling the ADReSS challenge: a multimodal approach to the automated recognition of Alzheimer&#x2019;s dementia</article-title><conf-name>Interspeech 2020</conf-name><conf-date>Oct 25-29, 2020</conf-date><conf-loc>Shanghai, China</conf-loc><fpage>2157</fpage><lpage>2161</lpage><pub-id pub-id-type="doi">10.21437/Interspeech.2020-2202</pub-id></nlm-citation></ref><ref id="ref84"><label>84</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Becker</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Boller</surname><given-names>F</given-names> </name><name name-style="western"><surname>Lopez</surname><given-names>OL</given-names> </name><name name-style="western"><surname>Saxton</surname><given-names>J</given-names> </name><name name-style="western"><surname>McGonigle</surname><given-names>KL</given-names> 
</name></person-group><article-title>The natural history of Alzheimer&#x2019;s disease. Description of study cohort and accuracy of diagnosis</article-title><source>Arch Neurol</source><year>1994</year><month>06</month><volume>51</volume><issue>6</issue><fpage>585</fpage><lpage>594</lpage><pub-id pub-id-type="doi">10.1001/archneur.1994.00540180063015</pub-id><pub-id pub-id-type="medline">8198470</pub-id></nlm-citation></ref><ref id="ref85"><label>85</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Grill</surname><given-names>P</given-names> </name><name name-style="western"><surname>Tu&#x010D;kov&#x00E1;</surname><given-names>J</given-names> </name></person-group><article-title>Speech databases of typical children and children with SLI</article-title><source>PLoS One</source><year>2016</year><volume>11</volume><issue>3</issue><fpage>e0150365</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0150365</pub-id><pub-id pub-id-type="medline">26963508</pub-id></nlm-citation></ref><ref id="ref86"><label>86</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chapman</surname><given-names>KL</given-names> </name><name name-style="western"><surname>Baylis</surname><given-names>A</given-names> </name><name name-style="western"><surname>Trost-Cardamone</surname><given-names>J</given-names> </name><etal/></person-group><article-title>The Americleft Speech Project: a training and reliability study</article-title><source>Cleft Palate Craniofac J</source><year>2016</year><month>01</month><volume>53</volume><issue>1</issue><fpage>93</fpage><lpage>108</lpage><pub-id pub-id-type="doi">10.1597/14-027</pub-id><pub-id pub-id-type="medline">25531738</pub-id></nlm-citation></ref><ref id="ref87"><label>87</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name 
name-style="western"><surname>Javid</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Gurugubelli</surname><given-names>K</given-names> </name><name name-style="western"><surname>Vuppala</surname><given-names>AK</given-names> </name></person-group><article-title>Single frequency filter bank based long-term average spectra for hypernasality detection and assessment in cleft lip and palate speech</article-title><source>ICASSP 2020 - 2020 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source><year>2020</year><publisher-name>IEEE</publisher-name><fpage>6754</fpage><lpage>6758</lpage><pub-id pub-id-type="doi">10.1109/ICASSP40776.2020.9054684</pub-id></nlm-citation></ref><ref id="ref88"><label>88</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Ke</surname><given-names>X</given-names> </name><name name-style="western"><surname>Mak</surname><given-names>MW</given-names> </name><etal/></person-group><article-title>Speaker-turn aware diarization for speech-based cognitive assessments</article-title><source>Front Neurosci</source><year>2023</year><volume>17</volume><fpage>1351848</fpage><pub-id pub-id-type="doi">10.3389/fnins.2023.1351848</pub-id><pub-id pub-id-type="medline">38292896</pub-id></nlm-citation></ref><ref id="ref89"><label>89</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schultebraucks</surname><given-names>K</given-names> </name><name name-style="western"><surname>Shalev</surname><given-names>AY</given-names> </name><name name-style="western"><surname>Michopoulos</surname><given-names>V</given-names> </name><etal/></person-group><article-title>A validated predictive algorithm of post-traumatic stress course following emergency department admission after a traumatic 
stressor</article-title><source>Nat Med</source><year>2020</year><month>07</month><volume>26</volume><issue>7</issue><fpage>1084</fpage><lpage>1088</lpage><pub-id pub-id-type="doi">10.1038/s41591-020-0951-z</pub-id><pub-id pub-id-type="medline">32632194</pub-id></nlm-citation></ref><ref id="ref90"><label>90</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zou</surname><given-names>B</given-names> </name><name name-style="western"><surname>Han</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Semi-structural interview-based Chinese multimodal depression corpus towards automatic preliminary screening of depressive disorders</article-title><source>IEEE Trans Affective Comput</source><year>2023</year><volume>14</volume><issue>4</issue><fpage>2823</fpage><lpage>2838</lpage><pub-id pub-id-type="doi">10.1109/TAFFC.2022.3181210</pub-id></nlm-citation></ref><ref id="ref91"><label>91</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gratch</surname><given-names>J</given-names> </name><name name-style="western"><surname>Artstein</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lucas</surname><given-names>G</given-names> </name><etal/></person-group><article-title>The distress analysis interview corpus of human and computer interviews</article-title><source>ELRA</source><year>2014</year><fpage>3123</fpage><lpage>3128</lpage><pub-id pub-id-type="doi">10.63317/3o7bccg9xequ</pub-id></nlm-citation></ref><ref id="ref92"><label>92</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wong</surname><given-names>KH</given-names> </name><name name-style="western"><surname>Yeung</surname><given-names>YT</given-names> </name><name
name-style="western"><surname>Chan</surname><given-names>EHY</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>PCM</given-names> </name><name name-style="western"><surname>Levow</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Meng</surname><given-names>H</given-names> </name></person-group><article-title>Development of a Cantonese dysarthric speech corpus</article-title><source>Proc Interspeech 2015</source><year>2015</year><fpage>329</fpage><lpage>333</lpage><pub-id pub-id-type="doi">10.21437/Interspeech.2015-149</pub-id></nlm-citation></ref><ref id="ref93"><label>93</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rudzicz</surname><given-names>F</given-names> </name><name name-style="western"><surname>Namasivayam</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Wolff</surname><given-names>T</given-names> </name></person-group><article-title>The TORGO database of acoustic and articulatory speech from speakers with dysarthria</article-title><source>Lang Resour Eval</source><year>2012</year><month>12</month><volume>46</volume><issue>4</issue><fpage>523</fpage><lpage>541</lpage><pub-id pub-id-type="doi">10.1007/s10579-011-9145-0</pub-id></nlm-citation></ref><ref id="ref94"><label>94</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>H</given-names> </name><name name-style="western"><surname>Hasegawa-Johnson</surname><given-names>M</given-names> </name><name name-style="western"><surname>Perlman</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Dysarthric speech database for universal access research</article-title><source>Proc Interspeech 2008</source><year>2008</year><fpage>1741</fpage><lpage>1744</lpage><pub-id
pub-id-type="doi">10.21437/Interspeech.2008-480</pub-id></nlm-citation></ref><ref id="ref95"><label>95</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Bayerl</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wolff von Gudenberg</surname><given-names>A</given-names> </name><name name-style="western"><surname>H&#x00F6;nig</surname><given-names>F</given-names> </name><name name-style="western"><surname>Noeth</surname><given-names>E</given-names> </name><name name-style="western"><surname>Riedhammer</surname><given-names>K</given-names> </name></person-group><article-title>KSoF: the kassel state of fluency dataset &#x2013; a therapy centered dataset of stuttering</article-title><source>arXiv</source><comment>Preprint posted online on Jun 16, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2203.05383</pub-id></nlm-citation></ref><ref id="ref96"><label>96</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Lea</surname><given-names>C</given-names> </name><name name-style="western"><surname>Mitra</surname><given-names>V</given-names> </name><name name-style="western"><surname>Joshi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kajarekar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bigham</surname><given-names>JP</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Bigham</surname><given-names>JP</given-names> </name></person-group><article-title>SEP-28k: a dataset for stuttering event detection from podcasts with people who stutter</article-title><source>ICASSP 2021 - 2021 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source><year>2021</year><fpage>6798</fpage><lpage>6802</lpage><pub-id
pub-id-type="doi">10.1109/ICASSP39728.2021.9413520</pub-id></nlm-citation></ref><ref id="ref97"><label>97</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lamel</surname><given-names>LF</given-names> </name><name name-style="western"><surname>Gauvain</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Eskenazi</surname><given-names>M</given-names> </name></person-group><article-title>BREF, a large vocabulary spoken corpus for French</article-title><source>Proc 2nd Eur Conf Speech Commun Technol (Eurospeech 1991)</source><year>1991</year><fpage>505</fpage><lpage>508</lpage><pub-id pub-id-type="doi">10.21437/Eurospeech.1991-126</pub-id></nlm-citation></ref><ref id="ref98"><label>98</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Woisard</surname><given-names>V</given-names> </name><name name-style="western"><surname>Ast&#x00E9;sano</surname><given-names>C</given-names> </name><name name-style="western"><surname>Balaguer</surname><given-names>M</given-names> </name><etal/></person-group><article-title>C2SI corpus: a database of speech disorder productions to assess intelligibility and quality of life in head and neck cancers</article-title><source>Lang Resour Eval</source><year>2021</year><month>03</month><volume>55</volume><issue>1</issue><fpage>173</fpage><lpage>190</lpage><pub-id pub-id-type="doi">10.1007/s10579-020-09496-3</pub-id></nlm-citation></ref><ref id="ref99"><label>99</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Orozco-Arroyave</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Arias-Londo&#x00F1;o</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Vargas-Bonilla</surname><given-names>JF</given-names> </name><name
name-style="western"><surname>Gonz&#x00E1;lez-R&#x00E1;tiva</surname><given-names>MC</given-names> </name><name name-style="western"><surname>N&#x00F6;th</surname><given-names>E</given-names> </name></person-group><article-title>New Spanish speech corpus database for the analysis of people suffering from Parkinson&#x2019;s disease</article-title><source>ELRA</source><year>2014</year><fpage>342</fpage><lpage>347</lpage><pub-id pub-id-type="doi">10.63317/3c643i6hcfzo</pub-id></nlm-citation></ref><ref id="ref100"><label>100</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Skodda</surname><given-names>S</given-names> </name><name name-style="western"><surname>Visser</surname><given-names>W</given-names> </name><name name-style="western"><surname>Schlegel</surname><given-names>U</given-names> </name></person-group><article-title>Gender-related patterns of dysprosody in Parkinson disease and correlation between speech variables and motor symptoms</article-title><source>J Voice</source><year>2011</year><month>01</month><volume>25</volume><issue>1</issue><fpage>76</fpage><lpage>82</lpage><pub-id pub-id-type="doi">10.1016/j.jvoice.2009.07.005</pub-id></nlm-citation></ref><ref id="ref101"><label>101</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rusz</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cmejla</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ruzickova</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ruzicka</surname><given-names>E</given-names> </name></person-group><article-title>Quantitative acoustic measurements for characterization of speech and voice disorders in early untreated Parkinson&#x2019;s disease</article-title><source>J Acoust Soc
Am</source><year>2011</year><month>01</month><day>1</day><volume>129</volume><issue>1</issue><fpage>350</fpage><lpage>367</lpage><pub-id pub-id-type="doi">10.1121/1.3514381</pub-id></nlm-citation></ref><ref id="ref102"><label>102</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Coelho</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Youse</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Le</surname><given-names>KN</given-names> </name></person-group><article-title>Conversational discourse in closed-head-injured and non-brain-injured adults</article-title><source>Aphasiology</source><year>2002</year><month>04</month><volume>16</volume><issue>4-6</issue><fpage>659</fpage><lpage>672</lpage><pub-id pub-id-type="doi">10.1080/02687030244000275</pub-id></nlm-citation></ref><ref id="ref103"><label>103</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Bhat</surname><given-names>C</given-names> </name><name name-style="western"><surname>Kopparapu</surname><given-names>SK</given-names> </name></person-group><article-title>FEMH voice data challenge: voice disorder detection and classification using acoustic descriptors</article-title><source>2018 IEEE International Conference on Big Data (Big Data)</source><year>2018</year><publisher-name>IEEE</publisher-name><fpage>5233</fpage><lpage>5237</lpage><pub-id pub-id-type="doi">10.1109/BigData.2018.8622543</pub-id></nlm-citation></ref><ref id="ref104"><label>104</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>P&#x00FC;tzer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Barry</surname><given-names>WJ</given-names> </name></person-group><article-title>Saarbruecken voice 
database</article-title><year>2008</year><publisher-name>Zenodo</publisher-name><pub-id pub-id-type="doi">10.5281/zenodo.16874898</pub-id></nlm-citation></ref><ref id="ref105"><label>105</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cesari</surname><given-names>U</given-names> </name><name name-style="western"><surname>De Pietro</surname><given-names>G</given-names> </name><name name-style="western"><surname>Marciano</surname><given-names>E</given-names> </name><name name-style="western"><surname>Niri</surname><given-names>C</given-names> </name><name name-style="western"><surname>Sannino</surname><given-names>G</given-names> </name><name name-style="western"><surname>Verde</surname><given-names>L</given-names> </name></person-group><article-title>A new database of healthy and pathological voices</article-title><source>Comput Electr Eng</source><year>2018</year><month>05</month><volume>68</volume><fpage>310</fpage><lpage>321</lpage><pub-id pub-id-type="doi">10.1016/j.compeleceng.2018.04.008</pub-id></nlm-citation></ref><ref id="ref106"><label>106</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Selvaraju</surname><given-names>RR</given-names> </name><name name-style="western"><surname>Cogswell</surname><given-names>M</given-names> </name><name name-style="western"><surname>Das</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vedantam</surname><given-names>R</given-names> </name><name name-style="western"><surname>Parikh</surname><given-names>D</given-names> </name><name name-style="western"><surname>Batra</surname><given-names>D</given-names> </name></person-group><article-title>Grad-CAM: visual explanations from deep networks via gradient-based localization</article-title><source>Int J Comput 
Vis</source><year>2020</year><month>02</month><volume>128</volume><issue>2</issue><fpage>336</fpage><lpage>359</lpage><pub-id pub-id-type="doi">10.1007/s11263-019-01228-7</pub-id></nlm-citation></ref><ref id="ref107"><label>107</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Simonyan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Vedaldi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zisserman</surname><given-names>A</given-names> </name></person-group><article-title>Deep inside convolutional networks: visualising image classification models and saliency maps</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 19, 2014</comment><pub-id pub-id-type="doi">10.48550/arXiv.1312.6034</pub-id></nlm-citation></ref><ref id="ref108"><label>108</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Springenberg</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Dosovitskiy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Brox</surname><given-names>T</given-names> </name><name name-style="western"><surname>Riedmiller</surname><given-names>M</given-names> </name></person-group><article-title>Striving for simplicity: the all convolutional net</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 13, 2015</comment><pub-id pub-id-type="doi">10.48550/arXiv.1412.6806</pub-id></nlm-citation></ref><ref id="ref109"><label>109</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lundberg</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SI</given-names> </name></person-group><article-title>A unified approach to interpreting model 
predictions</article-title><source>arXiv</source><comment>Preprint posted online on Nov 25, 2017</comment><pub-id pub-id-type="doi">10.48550/arXiv.1705.07874</pub-id></nlm-citation></ref><ref id="ref110"><label>110</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van der Maaten</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hinton</surname><given-names>G</given-names> </name></person-group><article-title>Visualizing data using t-SNE</article-title><source>J Mach Learn Res</source><year>2008</year><access-date>2026-05-20</access-date><volume>9</volume><issue>86</issue><fpage>2579</fpage><lpage>2605</lpage><comment><ext-link ext-link-type="uri" xlink:href="http://jmlr.org/papers/v9/vandermaaten08a.html">http://jmlr.org/papers/v9/vandermaaten08a.html</ext-link></comment></nlm-citation></ref><ref id="ref111"><label>111</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Muhammad</surname><given-names>MB</given-names> </name><name name-style="western"><surname>Yeasin</surname><given-names>M</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Muhammad</surname><given-names>MB</given-names> </name><name name-style="western"><surname>Yeasin</surname><given-names>M</given-names> </name></person-group><article-title>Eigen-CAM: class activation map using principal components</article-title><source>2020 International Joint Conference on Neural Networks (IJCNN)</source><year>2020</year><publisher-name>IEEE</publisher-name><fpage>1</fpage><lpage>7</lpage><pub-id pub-id-type="doi">10.1109/IJCNN48605.2020.9206626</pub-id></nlm-citation></ref><ref id="ref112"><label>112</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ribeiro</surname><given-names>MT</given-names> </name><name
name-style="western"><surname>Singh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Guestrin</surname><given-names>C</given-names> </name></person-group><article-title>&#x201C;Why should I trust you?&#x201D;: Explaining the predictions of any classifier</article-title><source>Proc 22nd ACM SIGKDD Int Conf Knowl Discovery Data Min</source><year>2016</year><fpage>1135</fpage><lpage>1144</lpage><pub-id pub-id-type="doi">10.1145/2939672.2939778</pub-id></nlm-citation></ref><ref id="ref113"><label>113</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Li</surname><given-names>M</given-names> </name><name name-style="western"><surname>Qin</surname><given-names>L</given-names> </name></person-group><article-title>Large language model for medical images: a survey of taxonomy, systematic review, and future trends</article-title><source>Big Data Min Anal</source><year>2025</year><volume>8</volume><issue>2</issue><fpage>496</fpage><lpage>517</lpage><pub-id pub-id-type="doi">10.26599/BDMA.2024.9020090</pub-id></nlm-citation></ref><ref id="ref114"><label>114</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>H</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kong</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Chaudhuri</surname><given-names>K</given-names> </name><name name-style="western"><surname>Jegelka</surname><given-names>S</given-names> </name><name name-style="western"><surname>Song</surname><given-names>L</given-names> </name><name name-style="western"><surname>Szepesvari</surname><given-names>C</given-names> </name><name name-style="western"><surname>Niu</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sabato</surname><given-names>S</given-names> </name></person-group><article-title>Rethinking attention-model explainability through faithfulness violation test</article-title><source>PMLR</source><year>2022</year><access-date>2026-05-21</access-date><volume>162</volume><fpage>13807</fpage><lpage>13824</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v162/liu22i.html">https://proceedings.mlr.press/v162/liu22i.html</ext-link></comment></nlm-citation></ref><ref id="ref115"><label>115</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lopardo</surname><given-names>G</given-names> </name><name name-style="western"><surname>Precioso</surname><given-names>F</given-names> </name><name name-style="western"><surname>Garreau</surname><given-names>D</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Salakhutdinov</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kolter</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Heller</surname><given-names>K</given-names> </name><name name-style="western"><surname>Weller</surname><given-names>A</given-names> </name><name
name-style="western"><surname>Oliver</surname><given-names>N</given-names> </name><name name-style="western"><surname>Scarlett</surname><given-names>J</given-names> </name></person-group><article-title>Attention meets post-hoc interpretability: a mathematical perspective</article-title><source>PMLR</source><year>2024</year><access-date>2026-05-21</access-date><volume>235</volume><fpage>32781</fpage><lpage>32800</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v235/lopardo24a.html">https://proceedings.mlr.press/v235/lopardo24a.html</ext-link></comment></nlm-citation></ref><ref id="ref116"><label>116</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bibal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cardon</surname><given-names>R</given-names> </name><name name-style="western"><surname>Alfter</surname><given-names>D</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Fran&#x00E7;ois</surname><given-names>T</given-names> </name></person-group><article-title>Is attention explanation?
An introduction to the debate</article-title><source>Proc 60th Ann Meeting Assoc Comput Linguist (Vol 1)</source><year>2022</year><fpage>3889</fpage><lpage>3900</lpage><pub-id pub-id-type="doi">10.18653/v1/2022.acl-long.269</pub-id></nlm-citation></ref><ref id="ref117"><label>117</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tracey</surname><given-names>B</given-names> </name><name name-style="western"><surname>Volfson</surname><given-names>D</given-names> </name><name name-style="western"><surname>Glass</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Towards interpretable speech biomarkers: exploring MFCCs</article-title><source>Sci Rep</source><year>2023</year><month>12</month><day>21</day><volume>13</volume><issue>1</issue><fpage>22787</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-49352-2</pub-id><pub-id pub-id-type="medline">38123603</pub-id></nlm-citation></ref><ref id="ref118"><label>118</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Jiao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Berisha</surname><given-names>V</given-names> </name><name name-style="western"><surname>Liss</surname><given-names>J</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Jiao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Berisha</surname><given-names>V</given-names> </name><name name-style="western"><surname>Liss</surname><given-names>J</given-names> </name></person-group><article-title>Interpretable phonological features for clinical applications</article-title><source>2017 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)</source><year>2017</year><publisher-name>IEEE</publisher-name><fpage>5045</fpage><lpage>5049</lpage><pub-id 
pub-id-type="doi">10.1109/ICASSP.2017.7953117</pub-id></nlm-citation></ref><ref id="ref119"><label>119</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aranovich</surname><given-names>T de C</given-names> </name><name name-style="western"><surname>Matulionyte</surname><given-names>R</given-names> </name></person-group><article-title>Ensuring AI explainability in healthcare: problems and possible policy solutions</article-title><source>Inf Commun Technol Law</source><year>2023</year><month>05</month><day>4</day><volume>32</volume><issue>2</issue><fpage>259</fpage><lpage>275</lpage><pub-id pub-id-type="doi">10.1080/13600834.2022.2146395</pub-id></nlm-citation></ref><ref id="ref120"><label>120</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Moorthy</surname><given-names>UMK</given-names> </name><name name-style="western"><surname>Muthukumaran</surname><given-names>AMJ</given-names> </name><name name-style="western"><surname>Kaliyaperumal</surname><given-names>V</given-names> </name><name name-style="western"><surname>Jayakumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Vijayaraghavan</surname><given-names>KA</given-names> </name></person-group><article-title>Explainability and regulatory compliance in healthcare</article-title><source>Explainable Artificial Intelligence in the Healthcare Industry</source><year>2025</year><publisher-name>John Wiley &#x0026; Sons, Ltd</publisher-name><fpage>521</fpage><lpage>561</lpage><pub-id pub-id-type="doi">10.1002/9781394249312</pub-id></nlm-citation></ref><ref id="ref121"><label>121</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>H</given-names> </name><name name-style="western"><surname>Gomez</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Huang</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Unberath</surname><given-names>M</given-names> </name></person-group><article-title>Explainable medical imaging AI needs human-centered design: guidelines and evidence from a systematic review</article-title><source>NPJ Digit Med</source><year>2022</year><month>10</month><day>19</day><volume>5</volume><issue>1</issue><fpage>156</fpage><pub-id pub-id-type="doi">10.1038/s41746-022-00699-2</pub-id><pub-id pub-id-type="medline">36261476</pub-id></nlm-citation></ref><ref id="ref122"><label>122</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Wullenweber</surname><given-names>A</given-names> </name><name name-style="western"><surname>Akman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Schuller</surname><given-names>BW</given-names> </name></person-group><article-title>CoughLIME: sonified explanations for the predictions of COVID-19 cough classifiers</article-title><source>2022 44th Annual International Conference of the IEEE Engineering in Medicine &#x0026; Biology Society (EMBC)</source><year>2022</year><publisher-name>IEEE</publisher-name><fpage>1342</fpage><lpage>1345</lpage><pub-id pub-id-type="doi">10.1109/EMBC48229.2022.9871291</pub-id></nlm-citation></ref><ref id="ref123"><label>123</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>TN</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>LL</given-names> </name><name name-style="western"><surname>Zeghidour</surname><given-names>N</given-names> </name><name name-style="western"><surname>Saeed</surname><given-names>A</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Xu</surname><given-names>XO</given-names>
</name><name name-style="western"><surname>Choi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Singhal</surname><given-names>P</given-names> </name><name name-style="western"><surname>Gerych</surname><given-names>W</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agrawal</surname><given-names>M</given-names> </name></person-group><article-title>CaReAQA: a cardiac and respiratory audio question answering model for open-ended diagnostic reasoning</article-title><source>PMLR</source><year>2025</year><access-date>2026-05-21</access-date><volume>287</volume><fpage>231</fpage><lpage>246</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v287/wang25b.html">https://proceedings.mlr.press/v287/wang25b.html</ext-link></comment></nlm-citation></ref><ref id="ref124"><label>124</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ma</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chng</surname><given-names>ES</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name></person-group><article-title>Audio-CoT: exploring chain-of-thought reasoning in large audio language model</article-title><source>arXiv</source><comment>Preprint posted online on Jan 13, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2501.07246</pub-id></nlm-citation></ref><ref id="ref125"><label>125</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Buck</surname><given-names>A</given-names> </name><name name-style="western"><surname>Cosma</surname><given-names>G</given-names>
</name><name name-style="western"><surname>Phillips</surname><given-names>I</given-names> </name><name name-style="western"><surname>Conway</surname><given-names>P</given-names> </name><name name-style="western"><surname>Baker</surname><given-names>P</given-names> </name></person-group><article-title>A framework for evaluating faithfulness in explainable AI for machine anomalous sound detection using frequency-band perturbation</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 26, 2026</comment><pub-id pub-id-type="doi">10.48550/arXiv.2601.19017</pub-id></nlm-citation></ref><ref id="ref126"><label>126</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Pizzimenti</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kalia</surname><given-names>A</given-names> </name><name name-style="western"><surname>Toghranegar</surname><given-names>JA</given-names> </name><etal/></person-group><article-title>Consensus-based definitions for VOCAL biomarkers: the international VOCAL initiative</article-title><source>medRxiv</source><comment>Preprint posted online on  Dec 3, 2025</comment><pub-id pub-id-type="doi">10.1101/2025.10.23.25338518</pub-id><pub-id pub-id-type="medline">41404273</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Database search queries.</p><media xlink:href="jmir_v28i1e83790_app1.docx" xlink:title="DOCX File, 20 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>PROBAST+AI risk assessment.</p><media xlink:href="jmir_v28i1e83790_app2.docx" xlink:title="DOCX File, 42 KB"/></supplementary-material><supplementary-material id="app3"><label>Checklist 1</label><p>PRISMA checklist.</p><media xlink:href="jmir_v28i1e83790_app3.pdf" xlink:title="PDF File, 206 KB"/></supplementary-material></app-group></back></article>