<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="review-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e77110</article-id><article-id pub-id-type="doi">10.2196/77110</article-id><article-categories><subj-group subj-group-type="heading"><subject>Review</subject></subj-group></article-categories><title-group><article-title>Critical Appraisal Tools for Evaluating Artificial Intelligence in Clinical Studies: Scoping Review</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Cabello</surname><given-names>Juan B</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ruiz Garcia</surname><given-names>Vicente</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Torralba</surname><given-names>Miguel</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Maldonado Fernandez</surname><given-names>Miguel</given-names></name><degrees>MSc, MPH, MD, 
PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ubeda</surname><given-names>Marimar</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ansuategui</surname><given-names>Eukene</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ramos-Ruperto</surname><given-names>Luis</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Emparanza</surname><given-names>Jose I</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Urreta</surname><given-names>Iratxe</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Iglesias</surname><given-names>Maria Teresa</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Pijoan</surname><given-names>Jose I</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Burls</surname><given-names>Amanda</given-names></name><degrees>BA, MBBS, MSc</degrees><xref ref-type="aff" rid="aff10">10</xref></contrib></contrib-group><aff id="aff1"><institution>Critical Appraisal Skills Program Spain</institution><addr-line>C/ Enriqueta Elizaizin, 2, E 5, 7C</addr-line><addr-line>Alicante</addr-line><country>Spain</country></aff><aff id="aff2"><institution>Unidad de Hospitalizaci&#x00F3;n a Domicilio, Hospital Universitari i Polit&#x00E8;cnic La 
Fe</institution><addr-line>Valencia</addr-line><country>Spain</country></aff><aff id="aff3"><institution>Servicio de Medicina Interna, Hospital Universitario de Guadalajara</institution><addr-line>Guadalajara</addr-line><country>Spain</country></aff><aff id="aff4"><institution>Department of ENT, Hospital Vital Alvarez Buylla</institution><addr-line>Mieres</addr-line><country>Spain</country></aff><aff id="aff5"><institution>Hospital Donostia</institution><addr-line>Donostia - San Sebastian</addr-line><country>Spain</country></aff><aff id="aff6"><institution>Biblioteca virtual de salud de Euskadi</institution><addr-line>Vitoria</addr-line><country>Spain</country></aff><aff id="aff7"><institution>Unidad de VIH, Medicina Interna, Hospital Universitario La Paz</institution><addr-line>Madrid</addr-line><country>Spain</country></aff><aff id="aff8"><institution>Unidad de Epidemiologia Cl&#x00ED;nica e Investigaci&#x00F3;n, CIBER-SP, Hospital Universitario Donostia</institution><addr-line>San Sebastian</addr-line><country>Spain</country></aff><aff id="aff9"><institution>Instituto de Investigaci&#x00F3;n Sanitaria Biobizkaia-Hospital Universitario Cruces, Bizkaia</institution><addr-line>Baracaldo</addr-line><country>Spain</country></aff><aff id="aff10"><institution>City St George's, University of London</institution><addr-line>London</addr-line><country>United Kingdom</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Bhagat</surname><given-names>Chinmaya</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Cuch&#x00ED;</surname><given-names>Gerard Urrutia</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Juan B Cabello, MD, PhD, Critical Appraisal Skills Program Spain, C/ Enriqueta 
Elizaizin, 2, E 5, 7C, Alicante, 03007, Spain, 34 619669243; <email>jbcabello@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>8</day><month>12</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e77110</elocation-id><history><date date-type="received"><day>07</day><month>05</month><year>2025</year></date><date date-type="rev-recd"><day>13</day><month>10</month><year>2025</year></date><date date-type="accepted"><day>14</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Juan B Cabello, Vicente Ruiz Garcia, Miguel Torralba, Miguel Maldonado Fernandez, Marimar Ubeda, Eukene Ansuategui, Luis Ramos-Ruperto, Jose I Emparanza, Iratxe Urreta, Maria Teresa Iglesias, Jose I Pijoan, Amanda Burls. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 8.12.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e77110"/><abstract><sec><title>Background</title><p>Health research that uses predictive and generative artificial intelligence (AI) is rapidly growing. As in traditional clinical studies, the way in which AI studies are conducted can introduce systematic errors. The translation of this AI evidence into clinical practice and research needs critical appraisal tools for clinical decision-makers and researchers.</p></sec><sec><title>Objective</title><p>This study aimed to identify existing tools for the critical appraisal of clinical studies that use AI and to examine the concepts and domains these tools explore. The research question was framed using the Population-Concept-Context (PCC) framework. Population (P): AI clinical studies; Concept (C): tools for critical appraisal and associated constructs such as quality, reporting, validity, risk of bias, and applicability; and context (C): clinical practice. In addition, studies on bias classification and chatbot assessment were included.</p></sec><sec sec-type="methods"><title>Methods</title><p>We searched medical and engineering databases (MEDLINE, Embase, CINAHL, PsycINFO, and IEEE) from inception to April 2024. We included clinical primary research with tools for critical appraisal. Classical reviews and systematic reviews were included in the first phase of screening and excluded in the secondary phase after identifying new tools by forward snowballing. We excluded nonhuman, computer, and mathematical research, and letters, opinion papers, and editorials. We used Rayyan (Qatar Computing Research Institute) for screening. 
Data extraction was done by two reviewers, and discrepancies were resolved through discussion. The protocol was previously registered in Open Science Framework. We adhered to the PRISMA-ScR (Preferred Reporting Items for Systematic reviews and Meta-Analyses extension for Scoping Reviews) and the PRISMA-S (PRISMA-Search) extension for reporting literature in systematic reviews.</p></sec><sec sec-type="results"><title>Results</title><p>We retrieved 4393 unique records for screening. After excluding 3803 records, 119 were selected for full-text screening. From these, 59 were excluded. After inclusion of 10 studies via other methods, a total of 70 records were finally included. We found 46 tools (26 guides for reporting AI studies, 16 tools for critical appraisal, 2 for study quality, and 2 for risk of bias). Nine papers focused on bias classification or mitigation. We found 15 chatbot assessment studies or systematic reviews of chatbot studies (6 and 9, respectively), which are a very heterogeneous group.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The results picture a landscape of evidence tools where reporting tools predominate, followed by critical appraisal, and a few tools for risk of bias. The mismatch of bias in AI and epidemiology should be considered for critical appraisal, especially regarding fairness and bias mitigation in AI. 
Finally, chatbot assessment studies represent a vast and evolving field in which progress in design, reporting, and critical appraisal is necessary and urgent.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>critical appraisal tools</kwd><kwd>scoping review</kwd><kwd>reporting guides</kwd><kwd>risk of bias</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Much clinical research is unreliable because of systematic errors in the way the study was conducted or because the research findings are not generalizable to the context in which a decision is being made. When presented with research findings, it is very important, therefore, that clinicians and policymakers can assess the certainty of the evidence&#x2014;that is, the level of confidence they can have that the estimated effect from a study or studies can be relied upon to support a particular decision or recommendation [<xref ref-type="bibr" rid="ref1">1</xref>].</p><p>To help decision-makers decide whether research is trustworthy and applicable to their context, tools and checklists have been developed to critically appraise the validity, results, and relevance of clinical and health care studies. There are many different tools that are adapted for different study designs. In addition to critical appraisal tools, there are also tools to guide the reporting of studies, ensuring that all relevant information is transparently and accurately included in the &#x201C;Methods&#x201D; and &#x201C;Results&#x201D; sections. 
Many examples of reporting tools are provided on the EQUATOR Network Website.</p><p>As new technologies develop and study designs evolve, there is a need to update and develop new critical appraisal tools to look for potential biases and flaws in these designs and to ensure that there is guidance on how such studies should be reported transparently and fully.</p><p>The exponential growth of the use of artificial intelligence (AI) is among the most important innovations in health care and clinical studies design. The term &#x201C;artificial intelligence&#x201D; was coined in 1956 to refer to the activity of machines to mimic human intelligence or behavior [<xref ref-type="bibr" rid="ref2">2</xref>]. Today, AI in health care encompasses a wide range of technologies and methods [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>There are many types and definitions of AI, and these are expanding all the time. One broad categorization is generative versus predictive AI: the former creates new content, while the latter analyzes data to make predictions. Both are used in health care. Other common classifications of AI are outlined below. The different types of AI are not mutually exclusive but overlap and build upon one another.</p><p>Classic AI is a simple rule-based system with a defined structure that is programmed and does not learn.</p><p>Machine learning (ML) allows computers to learn from data and perform tasks without being explicitly programmed, improving with exposure to additional data.</p><p>Deep learning is a type of ML that uses multilayer algorithms to create an artificial neural network that can learn and make intelligent decisions on its own.</p><p>Artificial vision or computer vision uses algorithms that enable machines to capture, process, analyze, and interpret digital images and video.</p><p>Natural language processing is a type of ML that enables computers to understand and communicate with human language. 
It is used, for example, by chatbots (computer programs that simulate conversation with human end users).</p><p>Large language models (LLMs) are a further development of natural language processing that trains on large datasets to generate rather than analyze text. These form the basis of applications such as ChatGPT, launched by OpenAI in November 2022.</p><p>All these approaches are being used in clinical and health care settings, for example, to make diagnoses [<xref ref-type="bibr" rid="ref5">5</xref>], identify cancers on imaging [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>], assess prognosis [<xref ref-type="bibr" rid="ref8">8</xref>], develop and test treatments, and create a diverse ecosystem of chatbots [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref9">9</xref>] that are currently a promising cutting edge in health care.</p><p>Just as traditional health research can have systematic errors that lead to biased or nongeneralizable results, so AI methods can introduce their own systematic errors during the design, data collection, training, or evaluation stages, which threaten the validity and reliability of AI models&#x2019; data analysis, findings, and conclusions. Such errors can arise from several different sources, including, but not limited to, flawed data, biased algorithms, and incorrect training.</p><p>Health care decision-makers, therefore, need to be able to critically appraise AI studies to detect these problems to be able to assess the certainty and relevance of the evidence they produce. 
Consequently, there is interest in the creation of new specific instruments or the adaptation of classic ones for the critical appraisal of AI studies [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>The purpose of this paper was to undertake a scoping review to identify existing tools for critical appraisal of AI clinical studies and describe the concepts these tools address. We see this as the important first step toward being able to develop, evaluate, and recommend tools for future use.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>This scoping review was designed and conducted according to the methodological framework of Levac et al [<xref ref-type="bibr" rid="ref12">12</xref>] and the Joanna Briggs Institute (JBI [<xref ref-type="bibr" rid="ref13">13</xref>]). We follow the PRISMA-ScR (Preferred Reporting Items for Systematic reviews and Meta-Analyses extension for Scoping Reviews [<xref ref-type="bibr" rid="ref14">14</xref>]) and PRISMA-S (PRISMA-Search) for reporting literature searches in systematic reviews [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>We used the PCC framework for scoping reviews [<xref ref-type="bibr" rid="ref13">13</xref>]. Definitions of each element are given below under the review question.</p><p>The protocol was registered on the Open Science Framework on April 18, 2024 [<xref ref-type="bibr" rid="ref16">16</xref>]. Amendments to the protocol are documented in this paper and in the protocol.</p></sec><sec id="s2-2"><title>PCC Definitions</title><sec id="s2-2-1"><title>Population</title><p>Existing tools to assess AI clinical studies. We included any type of study design and any clinical objective: diagnostic, prognostic, prediction rules, or decision-making systems. 
We included both predictive and generative AI.</p></sec><sec id="s2-2-2"><title>Concepts</title><p>Studies describing tools for critical appraisal and associated constructs (completeness of reporting, validity of study, quality of study, risk of bias, and applicability), whether or not they had been formally evaluated. Modifications or adaptations of original tools were accepted. Studies focusing on a comprehensive approach to bias in AI and fairness, understood as the bioethical consequences of bias in AI clinical studies, were also included. Chatbot assessment studies, including primary research and systematic reviews, were included if they focused on clinical activities (diagnosis, prognosis, treatment, prevention, recommendations, or clinical decisions) and did not meet exclusion criteria (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Inclusion and exclusion criteria.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Inclusion criteria</td><td align="left" valign="bottom">Exclusion criteria</td></tr></thead><tbody><tr><td align="left" valign="top">Any type of study, describing tools for critical appraisal and associated constructs with any design and any clinical objective: diagnostic, prognostic, prediction rules, or decision-making systems. Studies focused on clinicians and clinical centers and clinical activities both in and out of hospitals. 
Published protocols were accepted.</td><td align="left" valign="top">Studies on animals, nonhuman studies (specimens), studies focused on engineering, development of models, algorithms, or analysis of their mathematical properties, as well as artificial intelligence (AI) studies aimed at increasing image resolution or anatomical amplification, virtual reality, or simulations.</td></tr><tr><td align="left" valign="top">We included both predictive and generative AI.</td><td align="left" valign="top">Letters to the editor and opinion papers. Editorials were excluded, except if they included guidelines for reporting or reading AI studies. Experimental studies were excluded.</td></tr><tr><td align="left" valign="top">Studies focusing on a comprehensive approach to bias in AI and fairness, understood as the bioethical consequences of bias in AI clinical studies.</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">Chatbot assessment studies, primary research, and systematic reviews focused on clinical activities (diagnosis, prognosis, treatment, prevention, recommendations, or clinical decisions).</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">For chatbot studies only, we accepted lists of questions, clinical scenarios, or vignettes used in initial chatbot performance assessments. We were flexible in these studies.</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Classical reviews, systematic reviews, and congress abstracts describing or using AI critical appraisal tools were all included in the initial screening. Those that focused on AI biases or bias mitigation were included. The others were reviewed to identify any AI tools used. 
If they used AI tools, these tools were included in the review, but the systematic review itself was excluded.</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Not available.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-2-3"><title>Context</title><p>We focused on clinicians and clinical centers, and clinical activities both in and out of hospitals. Other clinical research or paraclinical areas were not included.</p></sec></sec><sec id="s2-3"><title>Review Question</title><p>Three questions are addressed in this review:</p><sec id="s2-3-1"><title>Primary Question</title><p>What tools exist for critical appraisal of studies on AI in the clinical setting, and what constructs do they address (relevance of the question, completeness of reporting, validity of study, quality of study, risk of bias, and applicability)? After reflection and discussion within the group, we made an amendment to the protocol to change the specified focus on critical appraisal, since this encompassed all the associated constructs and was more suitable for the clinical setting (the quality of the study or risk of bias being more specific and more relevant for systematic reviews).</p></sec><sec id="s2-3-2"><title>Subquestions</title><p>We anticipated 2 additional questions that would provide a more comprehensive evidence map of AI critical appraisal tools.</p><list list-type="order"><list-item><p>Concepts: all the above-mentioned constructs need to be adapted for the AI context, as AI studies may have different biases compared with classical epidemiological studies. 
Therefore, we sought to identify papers that focused on comprehensive reflections, catalogs, or glossaries of bias classification or bias mitigation in AI clinical studies.</p></list-item><list-item><p>Population: Because the upsurge in chatbot assessment studies in clinical research is so recent, we thought it was unlikely that we would find specific tools to assess their quality. Therefore, we looked at how the risk of bias is assessed in systematic reviews of these studies.</p></list-item></list></sec></sec><sec id="s2-4"><title>Search Strategy</title><p>We searched the following electronic databases from inception to April 2024: MEDLINE, Embase, CINAHL, PsycINFO, and IEEE Advancing Technology for Humanity. Two scientific information specialists undertook the searches independently. Their results were compared, and the search was refined. The terms used were as follows:</p><p>(&#x201C;artificial intelligence&#x201D; OR &#x201C;machine learning&#x201D; OR &#x201C;deep learning&#x201D; OR &#x201C;large language model&#x201D; OR &#x201C;computer vision&#x201D; OR &#x201C;artificial intelligence Chatbot&#x201D; OR &#x201C;ChatGPT&#x201D;) AND (&#x201C;risk assessment&#x201D; OR &#x201C;Bias&#x201D; OR &#x201C;quality assessment&#x201D; OR &#x201C;statistical bias&#x201D; OR &#x201C;reproducibility&#x201D; OR &#x201C;internal validity&#x201D; OR &#x201C;external validity&#x201D; OR &#x201C;critical appraisal&#x201D; OR &#x201C;reporting guideline&#x201D; OR &#x201C;checklist&#x201D; OR &#x201C;toolkit&#x201D; OR &#x201C;tools&#x201D;). 
No language limitations were used.</p><p>We searched the following registries: PROSPERO [<xref ref-type="bibr" rid="ref17">17</xref>], Open Science Framework [<xref ref-type="bibr" rid="ref18">18</xref>], and the Research Registry [<xref ref-type="bibr" rid="ref19">19</xref>].</p><p>We searched the EQUATOR Network [<xref ref-type="bibr" rid="ref20">20</xref>] for reporting guidelines using the terms &#x201C;artificial intelligence&#x201D; OR &#x201C;machine learning&#x201D; OR &#x201C;deep learning&#x201D; AND &#x201C;reporting guidelines.&#x201D;</p><p>We tracked citations from the systematic reviews of tools identified in the first phase of screening (snowballing). Finally, we incorporated some papers recommended by experts.</p><p>We used Zotero (Sean Takats) as the main tool for managing references. A complete description of the process, including search dates, is available in <xref ref-type="supplementary-material" rid="app5">Checklist 1</xref> (checklist of PRISMA Searching).</p><p>For chatbot assessment studies, we used a free-text&#x2013;based strategy using synonyms and truncations, because these are not yet Medical Subject Headings (MeSH) terms, so controlled language could not be used.</p></sec><sec id="s2-5"><title>Source of Evidence Selection</title><p>All searches were merged into a file and exported to Rayyan for screening. Duplicate documents identified by Rayyan were reviewed by an information specialist, and duplicates were removed.</p><sec id="s2-5-1"><title>First Phase (Screening by Title and Abstract)</title><p>We divided the retrieved papers into three randomized samples. Three groups of two researchers rated their allotted samples independently and in a blinded way. Disagreements flagged by Rayyan were resolved by discussion and consensus within each group session first, and then in a general session among groups. 
Two facilitators, not involved in the initial ratings, took part in all discussions (both within and among groups) to resolve disagreements and ensure consistent criteria across groups.</p></sec><sec id="s2-5-2"><title>Second Phase (Full-Text Screening)</title><p>The selected set of references was rated in Rayyan by 2 groups of researchers working independently. Inconsistencies (between and within groups) were identified and resolved by discussion and consensus in a common session with the help of 2 facilitators. The exclusions during full-text screening and their reasons were recorded.</p><p>The AI tools identified from systematic review papers were included in &#x201C;Identification via Other methods&#x201D; (citation searching from systematic review of AI studies). These studies, and those obtained from the EQUATOR Network library and experts and organizations, were cross-referenced with the studies remaining after full-text screening for duplicates.</p></sec></sec><sec id="s2-6"><title>Data Extraction</title><sec id="s2-6-1"><title>Main Question</title><p>We constructed and piloted a data template, informed by JBI [<xref ref-type="bibr" rid="ref21">21</xref>], which included editorial data such as author, year, associated domains, main question, and associated constructs, as well as other features such as clinical use, practical conditions, object of the tool, methodological characteristics, number of items, and method for developing the tool.</p><p>The first version of the template was tested by 2 researchers on a set of 10 included papers. The data template was refined, when necessary, in an iterative process. After modifications, the final version of the template was piloted on another set of 10 included papers. The final version of the template is available in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>The data were entered into Excel (Microsoft) independently by 2 researchers. 
Data inconsistencies were identified and resolved by discussion and consensus with a third reviewer.</p></sec><sec id="s2-6-2"><title>Subquestions</title><p>For bias and bias mitigation, the following data were extracted independently by 2 researchers: author, date, title, bias classification, bias mitigation, and free comments. The consistency of data and qualitative details was discussed, and agreement was reached by consensus with a third rater.</p><p>For chatbot assessment studies, a template was designed, piloted, and modified. The final version (refer to <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) included author, year, topic, title, PICO (Population/Patient/Problem, Intervention, Comparison, and Outcome), risk-of-bias tools used, and open comments.</p><p>Data extraction for chatbot studies used a hybrid approach, combining the active involvement of a researcher with a fully supervised ChatGPT&#x2013;retrieval-augmented generation model. This strategy was adopted given the predictable heterogeneity of chatbot interventions, with the aim of enhancing the clarity and reproducibility of extracted data. ChatGPT-4o was used to assist in drafting and refining the extraction tables, but all outputs were independently reviewed by 2 authors against the original papers. Discrepancies were reviewed and resolved through consensus. No sensitive data were exposed. To promote transparency and reproducibility, the exact prompts used in the retrieval-augmented generation process are shown in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p></sec></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Search Results</title><p>We identified 4392 records from databases and registries. After eliminating 470 duplicates, 3922 records were screened by title and abstract, and 3803 were excluded. 
The remaining 119 underwent full-text screening, and 59 were excluded. The reasons for exclusion were as follows: 50 were systematic reviews, 7 studies met the exclusion criteria, and 2 did not meet the inclusion criteria. Full details are available in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref> (exclusions after full-text screening). Of the 50 systematic reviews, 42 used specific AI tools to assess the quality of the studies, and the tools retrieved were incorporated into &#x201C;records identified via other methods.&#x201D;</p><p>Twelve studies were identified in the EQUATOR Network library, and 4 additional studies were obtained from experts and organizations; therefore, there were 58 records identified via other methods. Forty-eight of these were already captured in the 60 included studies from the search of electronic databases, leaving 10 additional studies to be included. Thus, a total of 70 studies were included in this review (refer to <xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>PRISMA (Preferred Reporting Items for Systematic reviews and Meta-Analyses) 2020 flow diagram adapted for scoping reviews.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e77110_fig01.png"/></fig></sec><sec id="s3-2"><title>Characteristics of Included Studies</title><p>Of the 70 retrieved studies, 46 focused on the main question of the review: tools for critical appraisal and related constructs. The general characteristics of these studies are shown in <xref ref-type="table" rid="table1">Table 1</xref> and <xref ref-type="table" rid="table2">Table 2</xref>. Nine papers were relevant to our second question, focusing on AI bias classification or bias mitigation (refer to <xref ref-type="table" rid="table3">Table 3</xref>). 
We found 15 chatbot assessment studies (6 were primary research studies and 9 were systematic reviews). The main characteristics of both types are shown in <xref ref-type="table" rid="table4">Table 4</xref> and <xref ref-type="table" rid="table5">Table 5</xref>, respectively.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Tools for critical appraisal and related constructs.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Author, year</td><td align="left" valign="bottom">Name of tool</td><td align="left" valign="bottom">Clinical aim</td><td align="left" valign="bottom">Clinical area or specialty</td><td align="left" valign="bottom">No of items</td><td align="left" valign="bottom">Setting/context of use</td><td align="left" valign="bottom">Construct</td><td align="left" valign="bottom">Design</td></tr></thead><tbody><tr><td align="left" valign="top">Luo et al, 2016 [<xref ref-type="bibr" rid="ref22">22</xref>]</td><td align="left" valign="top">Luo</td><td align="left" valign="top">CA/ML/AI<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">56</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Partially collaborative</td></tr><tr><td align="left" valign="top">Lambin et al, 2017 [<xref ref-type="bibr" rid="ref23">23</xref>]</td><td align="left" valign="top">Radiomics Quality Score (RQS)</td><td align="left" valign="top">Diagnosis/progn/treatment</td><td align="left" valign="top">Radiology</td><td align="left" valign="top">36</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Qiao, 2019 [<xref ref-type="bibr" rid="ref24">24</xref>]</td><td 
align="left" valign="top">Qiao</td><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">20</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Liu et al, 2019 [<xref ref-type="bibr" rid="ref25">25</xref>]</td><td align="left" valign="top">Liu</td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">3</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Vollmer et al, 2019 [<xref ref-type="bibr" rid="ref26">26</xref>]</td><td align="left" valign="top">TREE</td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">20</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Partially collaborative</td></tr><tr><td align="left" valign="top">Cruz Rivera et al, 2020 [<xref ref-type="bibr" rid="ref27">27</xref>]</td><td align="left" valign="top">SPIRIT-AI<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">Treatment</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">15</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">Faes et al, 2020 [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">Faes</td><td align="left" valign="top">Diagnosis/Progn/Treat</td><td align="left" valign="top">Clinical and 
research use</td><td align="left" valign="top"/><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Hernandez-Boussard et al, 2020 [<xref ref-type="bibr" rid="ref29">29</xref>]</td><td align="left" valign="top">MINIMAR<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">21</td><td align="left" valign="top">Social (identifying or mitigating algorithmic bias)</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Liu et al, 2020 [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">CONSORT-AI<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">Treatment</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">13</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">Mongan et al, 2020 [<xref ref-type="bibr" rid="ref31">31</xref>]</td><td align="left" valign="top">CLAIM<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">Radiology</td><td align="left" valign="top">42</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Partially collaborative</td></tr><tr><td align="left" valign="top">Norgeot et al, 2020 [<xref ref-type="bibr" rid="ref32">32</xref>]</td><td align="left" valign="top">MI-CLAIM<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" 
valign="top">CA/ML/AI</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">22</td><td align="left" valign="top">Social (identifying or mitigating algorithmic bias)</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Sengupta et al, 2020 [<xref ref-type="bibr" rid="ref33">33</xref>]</td><td align="left" valign="top">PRIME<sup><xref ref-type="table-fn" rid="table2fn7">g</xref></sup></td><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">Radiology</td><td align="left" valign="top">28</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Stevens et al, 2020 [<xref ref-type="bibr" rid="ref34">34</xref>]</td><td align="left" valign="top">Stevens</td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top"/><td align="left" valign="top">Social (identifying or mitigating algorithmic bias)</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Cabitza and Campagner, 2021 [<xref ref-type="bibr" rid="ref35">35</xref>]</td><td align="left" valign="top">IJMEDI checklist</td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">30</td><td align="left" valign="top">Preclinical and clinical studies:</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">El Naqua et al, 2021 [<xref ref-type="bibr" rid="ref36">36</xref>]</td><td align="left" valign="top">CLAMP</td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Radiation oncology</td><td align="left" valign="top">26</td><td align="left" 
valign="top">Preclinical and clinical studies:</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Kwong et al, 2021 [<xref ref-type="bibr" rid="ref37">37</xref>]</td><td align="left" valign="top">STREAM-URO<sup><xref ref-type="table-fn" rid="table2fn8">h</xref></sup></td><td align="left" valign="top">Prognosis</td><td align="left" valign="top">Urology</td><td align="left" valign="top">26</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">Meshaka et al, 2021 [<xref ref-type="bibr" rid="ref38">38</xref>]</td><td align="left" valign="top">CLAIM-Pediatrics Rx</td><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">Radiology</td><td align="left" valign="top">42</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Olczak et al, 2021 [<xref ref-type="bibr" rid="ref39">39</xref>]</td><td align="left" valign="top">CAIR<sup><xref ref-type="table-fn" rid="table2fn9">i</xref></sup></td><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">Traumatology</td><td align="left" valign="top">36</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Schwendicke et al, 2021 [<xref ref-type="bibr" rid="ref40">40</xref>]</td><td align="left" valign="top">Schwendicke</td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Oral health</td><td align="left" valign="top">31</td><td align="left" valign="top">Preclinical studies (ML)</td><td align="left" valign="top">Reporting</td><td align="left" 
valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">Sounderajah et al, 2021 [<xref ref-type="bibr" rid="ref41">41</xref>]</td><td align="left" valign="top">QUADAS-AI<sup><xref ref-type="table-fn" rid="table2fn10">j</xref></sup></td><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">Preclinical studies (ML)</td><td align="left" valign="top">Risk of bias</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Sounderajah et al, 2021 [<xref ref-type="bibr" rid="ref42">42</xref>]</td><td align="left" valign="top">STARD-AI<sup><xref ref-type="table-fn" rid="table2fn11">k</xref></sup></td><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Comprehensive design</td></tr><tr><td align="left" valign="top">Vinny et al, 2021 [<xref ref-type="bibr" rid="ref43">43</xref>]</td><td align="left" valign="top">Vinny</td><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">Radiology</td><td align="left" valign="top">14</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Collins et al, 2021 [<xref ref-type="bibr" rid="ref44">44</xref>]</td><td align="left" valign="top">PROBAST-AI<sup><xref ref-type="table-fn" rid="table2fn12">l</xref></sup></td><td align="left" valign="top">Prognosis</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Risk of bias</td><td align="left" 
valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">Al-Zaiti et al, 2022 [<xref ref-type="bibr" rid="ref45">45</xref>]</td><td align="left" valign="top">ROBUST-ML<sup><xref ref-type="table-fn" rid="table2fn13">m</xref></sup></td><td align="left" valign="top">Diagnosis/Progn/Treat</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">30</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">Daneshjou et al, 2022 [<xref ref-type="bibr" rid="ref46">46</xref>]</td><td align="left" valign="top">CLEAR/DERM<sup><xref ref-type="table-fn" rid="table2fn14">n</xref></sup></td><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">Dermatology</td><td align="left" valign="top">25</td><td align="left" valign="top">Clinical use of diagnosis/prognosis/treatment</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">Haller et al, 2022 [<xref ref-type="bibr" rid="ref47">47</xref>]</td><td align="left" valign="top">R-AI-DIOLOGY</td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Radiology</td><td align="left" valign="top">15</td><td align="left" valign="top">Preclinical and clinical studies</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Jha et al, 2022 [<xref ref-type="bibr" rid="ref48">48</xref>]</td><td align="left" valign="top">RELIANCE<sup><xref ref-type="table-fn" rid="table2fn15">o</xref></sup> (CLAIM)</td><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">Radiology</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Critical appraisal</td><td 
align="left" valign="top">Comprehensive design</td></tr><tr><td align="left" valign="top">Padula et al, 2022 [<xref ref-type="bibr" rid="ref49">49</xref>]</td><td align="left" valign="top">PALISADE<sup><xref ref-type="table-fn" rid="table2fn16">p</xref></sup></td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">8</td><td align="left" valign="top">Social (identifying or mitigating algorithmic bias)</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Van Smeden et al, 2022 [<xref ref-type="bibr" rid="ref50">50</xref>]</td><td align="left" valign="top">Van-Smeden</td><td align="left" valign="top">Prognosis</td><td align="left" valign="top">Cardiology</td><td align="left" valign="top">12</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Vasey et al, 2022 [<xref ref-type="bibr" rid="ref51">51</xref>]</td><td align="left" valign="top">DECIDE AI<sup><xref ref-type="table-fn" rid="table2fn17">q</xref></sup></td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">27</td><td align="left" valign="top">Clinical evaluation of decision support systems</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">Jones et al, 2022 [<xref ref-type="bibr" rid="ref52">52</xref>]</td><td align="left" valign="top">Jones</td><td align="left" valign="top">Prognosis/diagnosis</td><td align="left" valign="top">Dermatology</td><td align="left" valign="top">19</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Critical appraisal</td><td align="left" 
valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">Cabello, 2022 [<xref ref-type="bibr" rid="ref53">53</xref>]</td><td align="left" valign="top">CASPE-AI<sup><xref ref-type="table-fn" rid="table2fn18">r</xref></sup></td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">10</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Cacciamani et al, 2023 [<xref ref-type="bibr" rid="ref54">54</xref>]</td><td align="left" valign="top">PRISMA<sup><xref ref-type="table-fn" rid="table2fn19">s</xref></sup>-AI</td><td align="left" valign="top">Diagnosis/Progn/Treat</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">Preclinical and clinical studies</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">Alberich et al, 2023 [<xref ref-type="bibr" rid="ref55">55</xref>]</td><td align="left" valign="top">MAIC-10<sup><xref ref-type="table-fn" rid="table2fn20">t</xref></sup></td><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">Radiology</td><td align="left" valign="top">10</td><td align="left" valign="top">Social (identifying or mitigating algorithmic bias)</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Partially collaborative</td></tr><tr><td align="left" valign="top">Kocak et al, 2023 [<xref ref-type="bibr" rid="ref56">56</xref>]</td><td align="left" valign="top">CLEAR<sup><xref ref-type="table-fn" rid="table2fn21">u</xref></sup></td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">58</td><td align="left" valign="top">Preclinical studies 
(ML)</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">Kwong et al, 2023 [<xref ref-type="bibr" rid="ref57">57</xref>]</td><td align="left" valign="top">APPRAISE-AI</td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">24</td><td align="left" valign="top">Preclinical and clinical studies</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">Park et al, 2023 [<xref ref-type="bibr" rid="ref58">58</xref>]</td><td align="left" valign="top">Park</td><td align="left" valign="top">Diagnosis</td><td align="left" valign="top">Radiology</td><td align="left" valign="top">10</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">FDA, 2023 [<xref ref-type="bibr" rid="ref59">59</xref>]</td><td align="left" valign="top">FDA<sup><xref ref-type="table-fn" rid="table2fn22">v</xref></sup></td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">10</td><td align="left" valign="top">Other</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Collins et al, 2024 [<xref ref-type="bibr" rid="ref60">60</xref>]</td><td align="left" valign="top">TRIPOD-AI<sup><xref ref-type="table-fn" rid="table2fn23">w</xref></sup></td><td align="left" valign="top">Prognosis</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">52</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Comprehensive</td></tr><tr><td align="left" 
valign="top">Du Toit et al, 2023 [<xref ref-type="bibr" rid="ref61">61</xref>]</td><td align="left" valign="top">HUMANE<sup><xref ref-type="table-fn" rid="table2fn24">x</xref></sup></td><td align="left" valign="top">Prognosis/Diagnosis</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">55</td><td align="left" valign="top">Clinical use of diagnosis/prognosis/treatment</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Partially Collaborative</td></tr><tr><td align="left" valign="top">Cote and Lubowitz, 2024 [<xref ref-type="bibr" rid="ref62">62</xref>]</td><td align="left" valign="top">Cote</td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Traumatology</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">Preclinical and clinical studies</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Kocak et al, 2024 [<xref ref-type="bibr" rid="ref63">63</xref>]</td><td align="left" valign="top">METRICS<sup><xref ref-type="table-fn" rid="table2fn25">y</xref></sup></td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Radiology</td><td align="left" valign="top">30</td><td align="left" valign="top">Social (identifying or mitigating algorithmic bias)</td><td align="left" valign="top">Quality</td><td align="left" valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">Lekadir et al, 2024 [<xref ref-type="bibr" rid="ref64">64</xref>]</td><td align="left" valign="top">FUTURE-AI<sup><xref ref-type="table-fn" rid="table2fn26">z</xref></sup></td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Radiology</td><td align="left" valign="top">55</td><td align="left" valign="top">Preclinical studies (ML)</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" 
valign="top">Scott et al, 2024 [<xref ref-type="bibr" rid="ref65">65</xref>]</td><td align="left" valign="top">Scott</td><td align="left" valign="top">CA/ML/AI</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">10</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Critical appraisal</td><td align="left" valign="top">Experts</td></tr><tr><td align="left" valign="top">Vaira et al, 2024 [<xref ref-type="bibr" rid="ref66">66</xref>]</td><td align="left" valign="top">QUAMAI<sup><xref ref-type="table-fn" rid="table2fn27">aa</xref></sup> (ChatGPT; OpenAI)</td><td align="left" valign="top">Chatbot study</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">Diagnosis/prognosis/treatment</td><td align="left" valign="top">Quality</td><td align="left" valign="top">Comprehensive</td></tr><tr><td align="left" valign="top">CHART Collabor, 2024 [<xref ref-type="bibr" rid="ref67">67</xref>]</td><td align="left" valign="top">CHART<sup><xref ref-type="table-fn" rid="table2fn28">ab</xref></sup> (chatbots)</td><td align="left" valign="top">Chatbot study</td><td align="left" valign="top">Clinical and research use</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">Preclinical and clinical studies:</td><td align="left" valign="top">Reporting</td><td align="left" valign="top">Partially collaborative</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>AI: artificial intelligence.</p></fn><fn id="table2fn2"><p><sup>b</sup>SPIRIT-AI: Standard Protocol Items: Recommendations for Interventional Trials involving Artificial Intelligence.</p></fn><fn id="table2fn3"><p><sup>c</sup>MINIMAR: Minimum Information for Medical AI Reporting.</p></fn><fn id="table2fn4"><p><sup>d</sup>CONSORT-AI: Consolidated Standards of Reporting Trials extension for  Artificial 
Intelligence.</p></fn><fn id="table2fn5"><p><sup>e</sup>CLAIM: Checklist for Artificial Intelligence in Medical Imaging.</p></fn><fn id="table2fn6"><p><sup>f</sup>MI-CLAIM: Minimum Information about Clinical Artificial Intelligence Modeling.</p></fn><fn id="table2fn7"><p><sup>g</sup>PRIME: Proposed Requirements for Cardiovascular Imaging-Related Machine Learning Evaluation.</p></fn><fn id="table2fn8"><p><sup>h</sup>STREAM-URO: Standardized Reporting of Machine Learning Applications in Urology.</p></fn><fn id="table2fn9"><p><sup>i</sup>CAIR: Clinical Artificial Intelligence Research.</p></fn><fn id="table2fn10"><p><sup>j</sup>QUADAS-AI: Quality Assessment of Diagnostic Accuracy Studies for Artificial Intelligence.</p></fn><fn id="table2fn11"><p><sup>k</sup>STARD-AI: Standards for Reporting of Diagnostic Accuracy Studies for Artificial Intelligence.</p></fn><fn id="table2fn12"><p><sup>l</sup>PROBAST-AI: Prediction model Risk Of Bias Assessment Tool for AI studies.</p></fn><fn id="table2fn13"><p><sup>m</sup>ROBUST-ML: Ruling Out Bias Using Standard Tools in Machine Learning.</p></fn><fn id="table2fn14"><p><sup>n</sup>CLEARDERM: Checklist for Evaluation of Image-Based Artificial Intelligence (AI) Algorithm Reports in Dermatology.</p></fn><fn id="table2fn15"><p><sup>o</sup>RELAINCE: Recommendations for Evaluation of AI for Nuclear Medicine.</p></fn><fn id="table2fn16"><p><sup>p</sup>PALISADE: Purpose, Appropriateness, Limitations, Implementation, Sensitivity and Specificity, Algorithm characteristics, Data characteristics, and Explainability.</p></fn><fn id="table2fn17"><p><sup>q</sup>DECIDE AI: Developmental and Exploratory Clinical Investigations of Decision Support Systems Driven by Artificial Intelligence.</p></fn><fn id="table2fn18"><p><sup>r</sup>CASPE-AI: CRITICAL APPRAISAL SKILLS PROGRAM ESPA&#x00D1;A-Artificial Intelligence.</p></fn><fn id="table2fn19"><p><sup>s</sup>PRISMA: Preferred Reporting Items for Systematic reviews and Meta-Analyses.</p></fn><fn 
id="table2fn20"><p><sup>t</sup>MAIC-10 (Must AI Criteria-10).</p></fn><fn id="table2fn21"><p><sup>u</sup>CLEAR: CheckList for EvaluAtion of Radiomics research</p></fn><fn id="table2fn22"><p><sup>v</sup>FDA: Food and Drug Administration.</p></fn><fn id="table2fn23"><p><sup>w</sup>TRIPOD-AI: Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis using Artificial Intelligence.</p></fn><fn id="table2fn24"><p><sup>x</sup>HUMANE: Harmonious Understanding of Machine Learning Analytics Network.</p></fn><fn id="table2fn25"><p><sup>y</sup>METRICS: Methodological Radiomics Score.</p></fn><fn id="table2fn26"><p><sup>z</sup>FUTURE-AI: Fairness, Universality, Traceability, Usability, Robustness, and Explainability.</p></fn><fn id="table2fn27"><p><sup>aa</sup>QUAMAI: Quality Analysis of Medical Artificial Intelligence.</p></fn><fn id="table2fn28"><p><sup>ab</sup>CHART: Chatbot Assessment Reporting Tool.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Bias classification and bias mitigation papers.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Author</td><td align="left" valign="bottom">Year</td><td align="left" valign="bottom">Title</td><td align="left" valign="bottom">Bias classification</td><td align="left" valign="bottom">Bias mitigation</td><td align="left" valign="bottom">Comments</td></tr></thead><tbody><tr><td align="left" valign="top">Brault and Saxena [<xref ref-type="bibr" rid="ref68">68</xref>]</td><td align="left" valign="top">2021</td><td align="left" valign="top">For a critical appraisal of artificial intelligence in health care: the problem of bias in mHealth<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="top">Describes different steps where bias can be introduced during data collection, manipulation, or processing.</td><td align="left" valign="top">No 
information about this topic</td><td align="left" valign="top">Uses examples from contemporary use of mHealth apps</td></tr><tr><td align="left" valign="top">Fletcher et al [<xref ref-type="bibr" rid="ref69">69</xref>]</td><td align="left" valign="top">2021</td><td align="left" valign="top">Addressing fairness, bias, and appropriate use of artificial intelligence and ML<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup><break/>in global health</td><td align="left" valign="top">Systematic bias, sampling bias, and socioeconomic status bias.</td><td align="left" valign="top">Solutions to mitigate biases in all stages of algorithm development (sampling, regularization constraints, cost functions, and adversarial learning algorithms)</td><td align="left" valign="top">Uses an example of creating a model for diagnosing lung disease in primary care</td></tr><tr><td align="left" valign="top">Mehrabi et al<break/>[<xref ref-type="bibr" rid="ref70">70</xref>]</td><td align="left" valign="top">2022</td><td align="left" valign="top">A survey on bias and fairness in ML</td><td align="left" valign="top">Bias from data to algorithm (measurement bias, omitted variable bias, representation bias, aggregation bias, sampling bias, longitudinal data fallacy, and linking bias);<break/>Bias from algorithm to user (algorithmic bias, user interaction bias, popularity bias, emergent bias, and evaluation bias); and bias from user to data (historical bias, population bias, and self-selection bias).</td><td align="left" valign="top">Provides a synthesis of fairness definitions and a fair classification with a causal reflection about unfairness. 
In addition, it includes a comparison of different mitigation algorithms.</td><td align="left" valign="top">Includes datasets for fairness research.</td></tr><tr><td align="left" valign="top">Schwartz et al<break/>[<xref ref-type="bibr" rid="ref71">71</xref>]</td><td align="left" valign="top">2022</td><td align="left" valign="top">Towards a standard for identifying and managing bias in artificial intelligence</td><td align="left" valign="top">Systemic, statistical, and human biases.</td><td align="left" valign="top">Outlines 3 major challenges to mitigating bias: datasets, testing and evaluation, and human factors.</td><td align="left" valign="top">Presents preliminary guidance to address bias</td></tr><tr><td align="left" valign="top">Xu et al<break/>[<xref ref-type="bibr" rid="ref72">72</xref>]</td><td align="left" valign="top">2022</td><td align="left" valign="top">Algorithmic fairness in computational medicine</td><td align="left" valign="top">Computational bias (selection bias, attrition bias, publication bias, measurement bias, and algorithm bias).</td><td align="left" valign="top">Mitigation at preprocessing (demonstration and reweighting), internal processing (debiasing and adversarial learning), and postprocessing (matched odds and calibrated matched odds)</td><td align="left" valign="top">Summarizes available software libraries and tools for bias assessment and mitigation</td></tr><tr><td align="left" valign="top">Saint James Aquino [<xref ref-type="bibr" rid="ref73">73</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Making decisions: bias in artificial intelligence and data-driven diagnostic tools</td><td align="left" valign="top">Algorithmic bias</td><td align="left" valign="top">No information about this topic</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">Park and Hu [<xref ref-type="bibr" rid="ref74">74</xref>]</td><td 
align="left" valign="top">2023</td><td align="left" valign="top">Bias in artificial intelligence</td><td align="left" valign="top">Bias in data generation (data collection or determination of results), bias in model training, testing, and validation (model selection or treatment of missing values), and bias in model interpretation and application (acceptance or health literacy).</td><td align="left" valign="top">Preprocessing (reweighting), internal processing (reducing influence of a variable in the learning process), and postprocessing (adjusting the results in a post hoc manner). Also, discuss nonalgorithmic bias mitigation such as patient demographic distribution between training data and target population.</td><td align="left" valign="top">Includes a figure with the stages of artificial intelligence application development and associated biases.</td></tr><tr><td align="left" valign="top">Perez-Downes et al [<xref ref-type="bibr" rid="ref75">75</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Mitigating bias in clinical ML models</td><td align="left" valign="top">Algorithmic bias</td><td align="left" valign="top">Mitigation across domains: inclusivity (ensuring women and racial/ethnic minority groups are adequately represented in training datasets), specificity (ensuring that appropriate and specific training targets are selected when developing models), transparency (ensuring standard reporting to include information regarding training data, model annotation, and interpretability), validation (conducting rigorous testing/auditing), validation studies (internal and external), and clinical trials as appropriate before deploying ML<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup> models for use in clinical care.</td><td align="left" valign="top">Includes a figure illustrating a framework for mitigating bias,<break/>a figure with ethical challenges in ML for clinical research and practice, and examples of current 
applications of ML in clinical medicine.</td></tr><tr><td align="left" valign="top">Flores et al [<xref ref-type="bibr" rid="ref76">76</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Addressing bias in artificial intelligence for public health surveillance</td><td align="left" valign="top">Algorithmic bias resulting from data collection, labeling, and modeling of natural language processing (NLP)</td><td align="left" valign="top">The implementation of open collaboration, auditing processes, and the development of guidelines.</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>mHealth: mobile health.</p></fn><fn id="table3fn2"><p><sup>b</sup>ML: machine learning.</p></fn><fn id="table3fn3"><p><sup>c</sup>Not available.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Chatbot assessment studies (primary research).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Author</td><td align="left" valign="bottom">Year</td><td align="left" valign="bottom">Topic</td><td align="left" valign="bottom">Population</td><td align="left" valign="bottom">Intervention</td><td align="left" valign="bottom">Gold standard<break/>/ Comparison</td><td align="left" valign="bottom">Outcome</td><td align="left" valign="bottom">Type of chatbots</td><td align="left" valign="bottom">Reporting</td></tr></thead><tbody><tr><td align="left" valign="top">Yeo et al [<xref ref-type="bibr" rid="ref77">77</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Assessment of ChatGPT&#x2019;s accuracy and consistency in answering questions</td><td align="left" valign="top">Set of questions related to cirrhosis and hepatocellular carcinoma (HCC).</td><td align="left" valign="top">ChatGPT responses</td><td align="left" valign="top">Compared to medical experts&#x2019; 
responses and guidelines.</td><td align="left" valign="top">Accuracy, consistency.<break/>ChatGPT showed good performance but lacked specificity in regional recommendations.</td><td align="left" valign="top">LLM<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> (GPT-3.5&#x2013;based chatbot trained until 2021).</td><td align="left" valign="top">Addressed hallucinations, reproducibility issues, and lack of localized recommendations.</td></tr><tr><td align="left" valign="top">Johnson et al [<xref ref-type="bibr" rid="ref78">78</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Evaluation of ChatGPT in answering clinical questions generated by clinician specialists.</td><td align="left" valign="top">A set of clinical questions generated by specialists.</td><td align="left" valign="top">ChatGPT generated answers for various medical difficulties.</td><td align="left" valign="top">Expert-established benchmarks and clinical standards.</td><td align="left" valign="top">Accuracy and completeness<break/>High accuracy for easy/moderate.</td><td align="left" valign="top">LLM (GPT-3.5&#x2013;based chatbot trained until 2021).</td><td align="left" valign="top">Addressed risks of authoritative-looking errors and ethical/privacy concerns in AI<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup> medical tools.</td></tr><tr><td align="left" valign="top">Goh et al [<xref ref-type="bibr" rid="ref79">79</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Evaluation of ChatGPT (GPT-4) in clinical decision-making for chest pain cases.</td><td align="left" valign="top">Fifty clinicians were randomized to 2 different video clinical vignettes.</td><td align="left" valign="top">GPT-4 responses reviewed after initial physician answers; open interactions allowed.</td><td align="left" valign="top">Pre-LLM vs post-LLM responses were evaluated against clinical guidelines.</td><td align="left" valign="top">Accuracy and 
bias. Improvement (18%) in decision accuracy without increasing race/gender bias.</td><td align="left" valign="top">LLM (GPT-4) for recommendations and guideline discussions.</td><td align="left" valign="top">Discussed hallucinations, transparency, and the need for health care&#x2013;specific interfaces.</td></tr><tr><td align="left" valign="top">Hanna et al [<xref ref-type="bibr" rid="ref80">80</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Comparison of Bing AI&#x2019;s modes (Creative, Balanced, and Precise) for surgical nephrolithiasis questions.</td><td align="left" valign="top">Set of 20 questions on AUA<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup> surgical stone management.</td><td align="left" valign="top">Three Bing AI modes: Creative, Balanced, and Precise. Responses were evaluated according to AUA guidelines.</td><td align="left" valign="top">Evaluation using the Brief DISCERN score.</td><td align="left" valign="top">Quality, empathy, and adherence to guidelines. 
Creative mode showed the highest appropriateness (85%).</td><td align="left" valign="top">LLM (Bing AI with Creative, Balanced, and Precise modes).</td><td align="left" valign="top">Noted 15% inappropriate response rate; emphasized need for caution and further studies.</td></tr><tr><td align="left" valign="top">Zakka et al [<xref ref-type="bibr" rid="ref81">81</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Evaluation of retrieval-augmented language models (Almanac); Almanac vs other LLMs.</td><td align="left" valign="top">Clinical questions included in ClinicalQA<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup> (a benchmark of open-ended clinical questions).</td><td align="left" valign="top">Almanac used retrieval-based information for the accuracy of clinical answers.</td><td align="left" valign="top">Compared with ChatGPT-4o, Bing, and Bard.</td><td align="left" valign="top">Accurate clinical answers. Almanac performed better in factuality (91%), completeness, and adversarial safety (100%).</td><td align="left" valign="top">Retrieval-augmented LLMs integrating databases such as PubMed, UpToDate, and BMJ Best Practice.</td><td align="left" valign="top">Mentioned hallucination risks and emphasized rigorous testing before clinical implementation.</td></tr><tr><td align="left" valign="top">Huo et al [<xref ref-type="bibr" rid="ref82">82</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Analysis of LLM-based chatbots (ChatGPT, Bing, Bard, and Claude 2) in colorectal cancer.</td><td align="left" valign="top">Set of 9 clinical scenarios of colorectal cancer and screening.</td><td align="left" valign="top">Chatbots provided recommendations for screening, both for clinicians and lay patients.</td><td align="left" valign="top">Guidelines from USPSTF, CCS, USMSTF, and ACS; comparison between chatbots.</td><td align="left" valign="top">Accuracy and consistency across chatbots; ChatGPT was most 
accurate.</td><td align="left" valign="top">LLMs, including ChatGPT, Bing, Bard, and Claude 2.</td><td align="left" valign="top">Highlighted data quality variability and noted inconsistencies in patient vs clinician guidance.</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table4fn2"><p><sup>b</sup>AI: artificial intelligence.</p></fn><fn id="table4fn3"><p><sup>c</sup>AUA: American Urological Association.</p></fn><fn id="table4fn4"><p><sup>d</sup>Clinical QA: clinical question answering.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Systematic reviews of chatbot assessment studies.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Author</td><td align="left" valign="bottom">Year</td><td align="left" valign="bottom">Topic</td><td align="left" valign="bottom">Studies included</td><td align="left" valign="bottom">Intervention</td><td align="left" valign="bottom">Comparison</td><td align="left" valign="bottom">Risk of bias tool</td><td align="left" valign="bottom">Chatbot type</td><td align="left" valign="bottom">Reporting</td></tr></thead><tbody><tr><td align="left" valign="top">Geoghegan et al [<xref ref-type="bibr" rid="ref83">83</xref>]</td><td align="left" valign="top">2021</td><td align="left" valign="top">Focuses on postintervention follow-up in adults and adolescents.</td><td align="left" valign="top">10 studies: (3 RCT<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup>, 6 cohort studies),<break/>5492 participants (range 9&#x2010;4737)</td><td align="left" valign="top">Chatbots are not trained in psychology, but are designed for symptom monitoring and providing support.</td><td align="left" valign="top">Phone calls, standard postoperative care.</td><td align="left" valign="top">Cochrane RoB-2<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup>, 
ROBINS-I<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup>, and NIH<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup> cohorts<bold>.</bold> Risks due to lack of blinding and heterogeneity.</td><td align="left" valign="top">Text- and voice-based; rule-based and mixed dialog; integrated with electronic medical records and mobile apps.</td><td align="left" valign="top">Used PRISMA<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup>. Recommends standardizing outcomes and implementation strategies.</td></tr><tr><td align="left" valign="top">Oh et al [<xref ref-type="bibr" rid="ref84">84</xref>]</td><td align="left" valign="top">2021</td><td align="left" valign="top">Focuses on weight loss and a healthy diet in adults and adolescents.</td><td align="left" valign="top">9 studies (4 RCT, 5 quasi-experimental),<break/>891 participants (range 19-274)</td><td align="left" valign="top">Chatbots trained in social persuasion and emotional connection.</td><td align="left" valign="top">Usual care, alternative controls.</td><td align="left" valign="top">NIH for interventions and NIH prepost. 
Biases due to small sample sizes and lack of longitudinal analysis.</td><td align="left" valign="top">Constrained (rule-based) and unconstrained chatbots (free input); integrated with graphics, images, and voice for interaction.</td><td align="left" valign="top">PRISMA suggests robust theoretical evaluation and consistent metrics.</td></tr><tr><td align="left" valign="top">Ogilvie et al [<xref ref-type="bibr" rid="ref85">85</xref>]</td><td align="left" valign="top">2022</td><td align="left" valign="top">Focuses on psychological support for people with substance use disorders.</td><td align="left" valign="top">6 studies (1 RCT, 5 qualitative or mixed), 3&#x2010;180 participants</td><td align="left" valign="top">Chatbots trained in psychology, designed for CBT<sup><xref ref-type="table-fn" rid="table5fn6">f</xref></sup> and motivational interviewing.</td><td align="left" valign="top">No comparator and standard care in RCT.</td><td align="left" valign="top">MMAT<sup><xref ref-type="table-fn" rid="table5fn7">g</xref></sup>. Biases due to small sample sizes and lack of active controls.</td><td align="left" valign="top">Text-based (NLP<sup><xref ref-type="table-fn" rid="table5fn8">h</xref></sup>) and big data for analyzing consumption patterns; integrated into apps and social networks.</td><td align="left" valign="top">Use PRISMA. 
Identifies the need for rigorous validation and ethical design.</td></tr><tr><td align="left" valign="top">Aggarwal et al [<xref ref-type="bibr" rid="ref86">86</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Focuses on behavioral changes in smoking cessation, diet, and adherence in adults and adolescents.</td><td align="left" valign="top">15 studies (4 RCT, 9 pre-post), 108,360 participants (range 20&#x2010;99217)</td><td align="left" valign="top">Chatbots trained in behavioral strategies such as CBT and motivational interviewing.</td><td align="left" valign="top">Standard care and untreated groups.</td><td align="left" valign="top">NIH tool. Moderate to high biases due to unvalidated measures.</td><td align="left" valign="top">NLP- and ML-based<sup><xref ref-type="table-fn" rid="table5fn9">i</xref></sup> chatbots integrated into apps, messaging platforms, and social robots.</td><td align="left" valign="top">Identifies lack of standardization in metrics and outcomes. Use PRISMA and CONSORT-AI<sup><xref ref-type="table-fn" rid="table5fn10">j</xref></sup>.</td></tr><tr><td align="left" valign="top">Webster et al [<xref ref-type="bibr" rid="ref87">87</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Focuses on genetic counseling for hereditary cancer in adults.</td><td align="left" valign="top">7 observational studies,<break/>&#x003E;50,000 interactions</td><td align="left" valign="top">Chatbots are not trained in psychology, but are designed to collect family histories and provide education.</td><td align="left" valign="top">No comparator</td><td align="left" valign="top">JBI<sup><xref ref-type="table-fn" rid="table5fn11">k</xref></sup> cross-sectional. 
Biases due to a lack of demographic description and confounders.</td><td align="left" valign="top">Text-based with NLP; integrated into apps, mobile tools, and electronic medical records.</td><td align="left" valign="top">Suggests improving study quality and new controlled studies; use PRISMA.</td></tr><tr><td align="left" valign="top">Bendotti et al [<xref ref-type="bibr" rid="ref88">88</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Focuses on smoking cessation in adult smokers.</td><td align="left" valign="top">5 RCT,<break/>58,796 participants (84&#x2010;57,214)</td><td align="left" valign="top">Chatbots trained in psychology and behavioral strategies (CBT).</td><td align="left" valign="top">Apps without chatbots, standard care.</td><td align="left" valign="top">Cochrane RoB-2. Risks due to missing data and methodological deviations.</td><td align="left" valign="top">Mixed chatbots: rule-based and NLP, integrated into apps, social networks, and digital platforms.</td><td align="left" valign="top">Use PRISMA. 
Proposes CONSORT-AI<sup><xref ref-type="table-fn" rid="table5fn10">j</xref></sup> to improve consistency in reporting.</td></tr><tr><td align="left" valign="top">Singh et al [<xref ref-type="bibr" rid="ref89">89</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Focuses on behavioral changes in physical activity, diet, and sleep in adults and adolescents.</td><td align="left" valign="top">19 studies (11 RCT, 5 prepost, 2 nonrandomized), 3567 participants (25-958)</td><td align="left" valign="top">Chatbots trained in behavioral change theories.</td><td align="left" valign="top">Standard care and alternative groups.</td><td align="left" valign="top">EPHPP<sup><xref ref-type="table-fn" rid="table5fn12">l</xref></sup>: 14 weak studies, 4 moderate, and 1 strong.</td><td align="left" valign="top">Text, AI<sup><xref ref-type="table-fn" rid="table5fn13">m</xref></sup>, voice-based chatbots with graphics and avatars.</td><td align="left" valign="top">PRISMA recommends more rigorous designs and evaluations.</td></tr><tr><td align="left" valign="top">Noh et al [<xref ref-type="bibr" rid="ref90">90</xref>]</td><td align="left" valign="top">2023</td><td align="left" valign="top">Focuses on weight management in adults with obesity and overweight.</td><td align="left" valign="top">8 studies (3 RCT, 5 prepost), 712 participants (23-220)</td><td align="left" valign="top">Chatbots trained in psychology and personalization (CBT and individual goal-setting).</td><td align="left" valign="top">Alternative tutorials.</td><td align="left" valign="top">Cochrane RoB-2 and CASP<sup><xref ref-type="table-fn" rid="table5fn14">n</xref></sup> checklist. Bias in randomization and selective reporting.</td><td align="left" valign="top">Text-based (NLP and ML), one multimodal (text + voice), and big data for population-level adjustments.</td><td align="left" valign="top">Use PRISMA. 
Highlights the need for longer follow-ups and larger sample sizes.</td></tr><tr><td align="left" valign="top">Kim [<xref ref-type="bibr" rid="ref91">91</xref>]</td><td align="left" valign="top">2024</td><td align="left" valign="top">Focuses on mental, reproductive, and eating disorder health in women.</td><td align="left" valign="top">10 (7 RCT, 3 prepost), 21,537 participants (15&#x2010;19,643)</td><td align="left" valign="top">Chatbots trained in psychology are used for education, prevention, and psychological skill-building.</td><td align="left" valign="top">Waitlists and standard care.</td><td align="left" valign="top">Cochrane RoB-2. Biases in design, sample size, and intervention deviations.</td><td align="left" valign="top">Text-based with NLP and ML; integrated into apps and clinical environments.</td><td align="left" valign="top">Suggests metric standardization and methodological rigor. PRISMA.</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>RCT: randomized controlled trial.</p></fn><fn id="table5fn2"><p><sup>b</sup>RoB-2: Cochrane Risk of Bias 2 tool.</p></fn><fn id="table5fn3"><p><sup>c</sup>ROBINS-I: Risk Of Bias In Non-randomized Studies of Interventions.</p></fn><fn id="table5fn4"><p><sup>d</sup>NIH: National Institutes of Health.</p></fn><fn id="table5fn5"><p><sup>e</sup>PRISMA: Preferred Reporting Items for Systematic reviews and Meta-Analyses.</p></fn><fn id="table5fn6"><p><sup>f</sup>CBT: cognitive behavioral therapy.</p></fn><fn id="table5fn7"><p><sup>g</sup>MMAT: Mixed Methods Appraisal Tool.</p></fn><fn id="table5fn8"><p><sup>h</sup>NLP: natural language processing.</p></fn><fn id="table5fn9"><p><sup>i</sup>ML: machine learning.</p></fn><fn id="table5fn10"><p><sup>j</sup>CONSORT-AI: Consolidated Standards of Reporting Trials extension for Artificial Intelligence.</p></fn><fn id="table5fn11"><p><sup>k</sup>JBI: Joanna Briggs Institute.</p></fn><fn id="table5fn12"><p><sup>l</sup>EPHPP: Effective Public Health Practice 
Project.</p></fn><fn id="table5fn13"><p><sup>m</sup>AI: artificial intelligence.</p></fn><fn id="table5fn14"><p><sup>n</sup>CASP: Critical Appraisal Skills Programme.</p></fn></table-wrap-foot></table-wrap><p>The data obtained and used in this scoping review have been submitted earlier [<xref ref-type="bibr" rid="ref92">92</xref>].</p></sec><sec id="s3-3"><title>Tools for Critical Appraisal and Associated Constructs</title><p>Of the 46 identified tools (<xref ref-type="table" rid="table2">Table 2</xref>), 26 were guides for reporting AI studies, 16 were critical appraisal tools, 2 were tools for the assessment of study quality, and 2 were protocols for tools assessing risk of bias. Most of these tools (44) focused on classical predictive AI. Only 2 were oriented toward chatbot assessment studies: one, Quality Analysis of Medical Artificial Intelligence (QAMAI [<xref ref-type="bibr" rid="ref66">66</xref>]), was designed to assess the quality of AI chatbots, and the other was a protocol for reporting this type of study [<xref ref-type="bibr" rid="ref67">67</xref>].</p><p>With respect to the type of publication, most of these tools (41) were original. In 5 cases, the published tool was associated with a systematic review&#x2014;in some cases developed to create the tool [<xref ref-type="bibr" rid="ref54">54</xref>], and in others related to the assessment of included studies [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref52">52</xref>]. In other cases, it was part of a classic review in a journal [<xref ref-type="bibr" rid="ref61">61</xref>] or in a book chapter [<xref ref-type="bibr" rid="ref53">53</xref>].</p><p>Regarding clinical setting or specialty, 26 tools were designed for general clinical purposes. Eleven tools were developed for medical imaging or radiology and focused on image quality or diagnosis. 
Dermatology and traumatology had 2 tools each, and cardiology, radiation oncology, urology, oral health, and otorhinolaryngology-head and neck surgery had one tool each.</p><p><xref ref-type="fig" rid="figure2">Figures 2</xref> and <xref ref-type="fig" rid="figure3">3</xref> show the important historical aggregation of reporting and critical appraisal tools, particularly in 2021 and 2022, with a renewed increase in 2023 and 2024.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Constructs and year of publication as artificial intelligence (AI) tools.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e77110_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Constructs, year of publication, and name of tools.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e77110_fig03.png"/></fig><p>Reporting tools are predominant, which is unsurprising, as they are a prerequisite to assess other dimensions and represent the first step to reach consistency. These reporting tools are for different designs: randomized controlled trial publications and protocols (CONSORT-AI, Consolidated Standards of Reporting Trials extension for Artificial Intelligence and SPIRIT, Standard Protocol Items: Recommendations for Interventional Trials), respectively, diagnostic accuracy studies (STARD-AI, Standards for Reporting of Diagnostic Accuracy Studies for AI), and Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis using AI (TRIPOD-AI). 
Other tools focused on medical images or other specialties (Checklist for Artificial Intelligence in Medical Imaging, CLAIM; Minimum Information for Medical AI Reporting, MINIMAR; Proposed Requirements for Cardiovascular Imaging-Related ML Evaluation, PRIME; Standardized Reporting of ML Applications in Urology, STREAM-URO; Clinical Artificial Intelligence Research, CAIR, etc) and the PRISMA-AI protocol focused on the systematic review of AI studies (refer to acronyms in <xref ref-type="table" rid="table6">Table 6</xref>).</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Critical appraisal tools: name, acronym, and meaning or explanation.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Author, year</td><td align="left" valign="bottom">Name or acronym</td><td align="left" valign="bottom">Development, explanation, or meaning</td></tr></thead><tbody><tr><td align="left" valign="top">Luo et al, 2016 [<xref ref-type="bibr" rid="ref22">22</xref>]</td><td align="left" valign="top">Luo</td><td align="left" valign="top">Guidelines for developing and reporting machine learning (ML) predictive models.</td></tr><tr><td align="left" valign="top">Lambin et al, 2017 [<xref ref-type="bibr" rid="ref23">23</xref>]</td><td align="left" valign="top">Radiomics Quality Score (RQS)</td><td align="left" valign="top">Radiomics quality score.</td></tr><tr><td align="left" valign="top">Qiao, 2019 [<xref ref-type="bibr" rid="ref24">24</xref>]</td><td align="left" valign="top">Qiao</td><td align="left" valign="top">Checklist for studies of ML.</td></tr><tr><td align="left" valign="top">Liu et al, 2019 [<xref ref-type="bibr" rid="ref25">25</xref>]</td><td align="left" valign="top">Liu</td><td align="left" valign="top">Checklist for studies of ML.</td></tr><tr><td align="left" valign="top">Vollmer et al, 2019 [<xref ref-type="bibr" rid="ref26">26</xref>]</td><td align="left" valign="top">TREE</td><td align="left" 
valign="top">ML: 20 critical questions on transparency, replicability, ethics, and effectiveness.</td></tr><tr><td align="left" valign="top">Cruz Rivera et al, 2020 [<xref ref-type="bibr" rid="ref27">27</xref>]</td><td align="left" valign="top">SPIRIT-AI</td><td align="left" valign="top">The Standard Protocol Items: Recommendations for Interventional Trials involving Artificial Intelligence.</td></tr><tr><td align="left" valign="top">Faes et al, 2020 [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">Faes</td><td align="left" valign="top">Critical appraisal of ML studies.</td></tr><tr><td align="left" valign="top">Hernandez-Boussard et al, 2020 [<xref ref-type="bibr" rid="ref29">29</xref>]</td><td align="left" valign="top">MINIMAR</td><td align="left" valign="top">MINimum Information for Medical AI Reporting: developing reporting standards for AI in health care.</td></tr><tr><td align="left" valign="top">Liu et al, 2020 [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">CONSORT-AI</td><td align="left" valign="top">Consolidated Standards of Reporting Trials extension for AI.</td></tr><tr><td align="left" valign="top">Mongan et al, 2020 [<xref ref-type="bibr" rid="ref31">31</xref>]</td><td align="left" valign="top">CLAIM</td><td align="left" valign="top">Checklist for AI in Medical Imaging.</td></tr><tr><td align="left" valign="top">Norgeot et al, 2020 [<xref ref-type="bibr" rid="ref32">32</xref>]</td><td align="left" valign="top">MI-CLAIM</td><td align="left" valign="top">Minimum Information about Clinical AI Modeling: the MI-CLAIM checklist.</td></tr><tr><td align="left" valign="top">Sengupta et al, 2020 [<xref ref-type="bibr" rid="ref33">33</xref>]</td><td align="left" valign="top">PRIME</td><td align="left" valign="top">Proposed Requirements for Cardiovascular Imaging-Related ML Evaluation.</td></tr><tr><td align="left" valign="top">Stevens et al, 2020 [<xref ref-type="bibr" rid="ref34">34</xref>]</td><td 
align="left" valign="top">Stevens</td><td align="left" valign="top">Recommendations for reporting ML analyses in clinical research.</td></tr><tr><td align="left" valign="top">Cabitza and Campagner, 2021 [<xref ref-type="bibr" rid="ref35">35</xref>]</td><td align="left" valign="top">IJMEDI checklist</td><td align="left" valign="top">International Journal of Medical Informatics checklist for studies of ML.</td></tr><tr><td align="left" valign="top">El Naqua et al 2021 [<xref ref-type="bibr" rid="ref36">36</xref>]</td><td align="left" valign="top">CLAMP</td><td align="left" valign="top">Checklist for AI in medical physics.</td></tr><tr><td align="left" valign="top">Kwong et al, 2021 [<xref ref-type="bibr" rid="ref37">37</xref>]</td><td align="left" valign="top">STREAM-URO</td><td align="left" valign="top">The Standardized Reporting of ML Applications in Urology framework.</td></tr><tr><td align="left" valign="top">Meshaka et al, 2021 [<xref ref-type="bibr" rid="ref38">38</xref>]</td><td align="left" valign="top">CLAIM-Pediatrics<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup> Rx</td><td align="left" valign="top">AI research reporting guidelines relevant to the pediatric radiologist (CLAIM adaptation).</td></tr><tr><td align="left" valign="top">Olczak et al, 2021 [<xref ref-type="bibr" rid="ref39">39</xref>]</td><td align="left" valign="top">CAIR</td><td align="left" valign="top">Clinical AI Research checklist.</td></tr><tr><td align="left" valign="top">Schwendicke et al, 2021 [<xref ref-type="bibr" rid="ref40">40</xref>]</td><td align="left" valign="top">Schwendicke</td><td align="left" valign="top">AI in dental research: checklist.</td></tr><tr><td align="left" valign="top">Sounderajah et al, 2021 [<xref ref-type="bibr" rid="ref41">41</xref>]</td><td align="left" valign="top">QUADAS-AI</td><td align="left" valign="top">Quality Assessment of Diagnostic Accuracy Studies for Artificial Intelligence.</td></tr><tr><td align="left" valign="top">Sounderajah et 
al, 2021 [<xref ref-type="bibr" rid="ref42">42</xref>]</td><td align="left" valign="top">STARD-AI</td><td align="left" valign="top">Standards for Reporting of Diagnostic Accuracy Studies for AI.</td></tr><tr><td align="left" valign="top">Vinny et al, 2021 [<xref ref-type="bibr" rid="ref43">43</xref>]</td><td align="left" valign="top">Vinny</td><td align="left" valign="top">Critical appraisal of ML.</td></tr><tr><td align="left" valign="top">Collins et al, 2021 [<xref ref-type="bibr" rid="ref44">44</xref>]</td><td align="left" valign="top">PROBAST-AI</td><td align="left" valign="top">Prediction model Risk Of Bias Assessment Tool for AI studies.</td></tr><tr><td align="left" valign="top">Al-Zaiti et al, 2022 [<xref ref-type="bibr" rid="ref45">45</xref>]</td><td align="left" valign="top">ROBUST-ML</td><td align="left" valign="top">Ruling Out Bias Using Standard Tools in ML.</td></tr><tr><td align="left" valign="top">Daneshjou et al, 2022 [<xref ref-type="bibr" rid="ref46">46</xref>]</td><td align="left" valign="top">CLEAR/DERM</td><td align="left" valign="top">Checklist for Evaluation of Image-Based AI Algorithm Reports in Dermatology.</td></tr><tr><td align="left" valign="top">Haller et al, 2022 [<xref ref-type="bibr" rid="ref47">47</xref>]</td><td align="left" valign="top">R-AI-DIOLOGY</td><td align="left" valign="top">Checklist for evaluation of AI tools in clinical neuroradiology.</td></tr><tr><td align="left" valign="top">Jha et al, 2022 [<xref ref-type="bibr" rid="ref48">48</xref>]</td><td align="left" valign="top">RELAINCE</td><td align="left" valign="top">Recommendations for Evaluation of AI for Nuclear Medicine.</td></tr><tr><td align="left" valign="top">Padula et al, 2022 [<xref ref-type="bibr" rid="ref49">49</xref>]</td><td align="left" valign="top">PALISADE</td><td align="left" valign="top">ML in Health Economics and Outcomes Research: Purpose, Appropriateness, Limitations, Implementation, Sensitivity, Algorithm characteristics, Data characteristics, and 
Explainability.</td></tr><tr><td align="left" valign="top">Van Smeden et al, 2022 [<xref ref-type="bibr" rid="ref50">50</xref>]</td><td align="left" valign="top">Van-Smeden</td><td align="left" valign="top">Critical appraisal of AI-based prediction models for cardiovascular disease.</td></tr><tr><td align="left" valign="top">Vasey et al, 2022 [<xref ref-type="bibr" rid="ref51">51</xref>]</td><td align="left" valign="top">DECIDE-AI</td><td align="left" valign="top">Reporting guideline for early-stage clinical evaluation of decision support systems driven by AI.</td></tr><tr><td align="left" valign="top">Jones et al, 2022 [<xref ref-type="bibr" rid="ref52">52</xref>]</td><td align="left" valign="top">Jones</td><td align="left" valign="top">Checklist for evaluation of AI and ML for triage or detection of possible skin cancers.</td></tr><tr><td align="left" valign="top">Cabello, 2022 [<xref ref-type="bibr" rid="ref53">53</xref>]</td><td align="left" valign="top">CASPE-AI</td><td align="left" valign="top">Critical Appraisal of Studies using Predictive Evidence-AI.</td></tr><tr><td align="left" valign="top">Cacciamani et al, 2023 [<xref ref-type="bibr" rid="ref54">54</xref>]</td><td align="left" valign="top">PRISMA-AI</td><td align="left" valign="top">Preferred Reporting Items for Systematic reviews and Meta-Analyses extension for AI.</td></tr><tr><td align="left" valign="top">Alberich et al 2023 [<xref ref-type="bibr" rid="ref55">55</xref>]</td><td align="left" valign="top">MAIC-10</td><td align="left" valign="top">Must AI Criteria&#x2010;10: quality checklist for publications using AI and medical images.</td></tr><tr><td align="left" valign="top">Kocak et al, 2023 [<xref ref-type="bibr" rid="ref56">56</xref>]</td><td align="left" valign="top">CLEAR</td><td align="left" valign="top">CheckList for Evaluation of Radiomics research.</td></tr><tr><td align="left" valign="top">Kwong et al, 2023 [<xref ref-type="bibr" rid="ref57">57</xref>]</td><td align="left" 
valign="top">APPRAISE-AI</td><td align="left" valign="top">Tool for quantitative evaluation of AI studies for clinical decision support.</td></tr><tr><td align="left" valign="top">Park et al, 2023 [<xref ref-type="bibr" rid="ref58">58</xref>]</td><td align="left" valign="top">Park</td><td align="left" valign="top">Critical appraisal: 10 key items for radiologists to check when reading publications of clinical research on AI.</td></tr><tr><td align="left" valign="top">FDA, 2023 [<xref ref-type="bibr" rid="ref59">59</xref>]</td><td align="left" valign="top">FDA</td><td align="left" valign="top">Ten guiding principles for developing good ML practices.</td></tr><tr><td align="left" valign="top">Collins GS, 2024 [<xref ref-type="bibr" rid="ref60">60</xref>]</td><td align="left" valign="top">TRIPOD-AI</td><td align="left" valign="top">Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis Or Diagnosis using regression or ML methods.</td></tr><tr><td align="left" valign="top">Du Toit et al, 2023 [<xref ref-type="bibr" rid="ref61">61</xref>]</td><td align="left" valign="top">HUMANE</td><td align="left" valign="top">ML Analytics Network survey questionnaire for hypertension studies</td></tr><tr><td align="left" valign="top">Cote and Lubowitz, 2024 [<xref ref-type="bibr" rid="ref62">62</xref>]</td><td align="left" valign="top">Cote</td><td align="left" valign="top">Recommended requirements and essential elements for proper reporting of the use of AI and ML tools.</td></tr><tr><td align="left" valign="top">Kocak et al, 2024 [<xref ref-type="bibr" rid="ref63">63</xref>]</td><td align="left" valign="top">METRICS</td><td align="left" valign="top">METhodological RadiomICs Score.</td></tr><tr><td align="left" valign="top">Lekadir et al, 2024 [<xref ref-type="bibr" rid="ref64">64</xref>]</td><td align="left" valign="top">FUTURE-AI</td><td align="left" valign="top">Guiding principles and consensus recommendations for trustworthy AI.</td></tr><tr><td 
align="left" valign="top">Scott et al, 2024 [<xref ref-type="bibr" rid="ref65">65</xref>]</td><td align="left" valign="top">Scott</td><td align="left" valign="top">Checklist for assessing suitability of ML applications.</td></tr><tr><td align="left" valign="top">Vaira et al, 2024 [<xref ref-type="bibr" rid="ref66">66</xref>]</td><td align="left" valign="top">QAMAI (ChatGPT)</td><td align="left" valign="top">Validation of the Quality Analysis of Medical AI (QAMAI) tool.</td></tr><tr><td align="left" valign="top">CHART Collaborative, 2024 [<xref ref-type="bibr" rid="ref67">67</xref>]</td><td align="left" valign="top">CHART (Chatbots)</td><td align="left" valign="top">Protocol for the development of the Chatbot Assessment Reporting Tool (CHART) for clinical advice.</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>CLAIM: Checklist for Artificial Intelligence in Medical Imaging.</p></fn></table-wrap-foot></table-wrap><p>The number of tools for critical appraisal has significantly increased since 2022. They include aspects of the relevance of clinical questions and the clinical context of the technology, but also the necessary reflections about how to apply the results to the clinical setting or clinical decisions. Most of these papers have a teaching purpose (explicit or implicit) and often include comprehensive reviews (systematic or classical) of AI techniques with glossaries or taxonomic suggestions.</p><p>Only two tools are focused on the risk of bias. Both focus on predictive AI (the Quality Assessment of Diagnostic Accuracy Studies for Artificial Intelligence [QUADAS-AI] and the Prediction Model Risk Of Bias Assessment Tool for AI studies [PROBAST-AI]); both are AI extensions of classical tools for diagnostic accuracy studies and prognosis studies, respectively, and, at the time of writing, are not yet available (they are under construction or exist as protocols). 
Regarding study quality, we found 2 instruments: Methodological Radiomics Score (METRICS), which is a recent tool designed to assess the quality of radiomic studies, and the above-mentioned QAMAI, which aims to assess the quality of health information offered by AI chatbots trained in otorhinolaryngology. The latter is inspired by mDISCERN, which is a well-validated and widely used tool for assessing the quality of health information from websites [<xref ref-type="bibr" rid="ref93">93</xref>].</p><p>With respect to the methods used to develop the tools (<xref ref-type="fig" rid="figure4">Figure 4</xref>), a comprehensive strategy including a systematic review of the literature and a formal Delphi process was used in 15 cases. In 6 tools, the method was only partially described, and in 23, there was insufficient description of the methods used, so we assume that they were developed by experts. There were, on average, 21 items for critical appraisal tools, 30 for study quality tools, and 29 for reporting guides. There was no information about the above-mentioned protocols for risk of bias.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Design style for each type of construct.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e77110_fig04.png"/></fig></sec><sec id="s3-4"><title>Bias and Bias Mitigation</title><p>Nine papers addressed the issue of bias or bias mitigation in AI studies. 
Three of them focused specifically on bias classification: in 2 cases in an exhaustive manner [<xref ref-type="bibr" rid="ref68">68</xref>,<xref ref-type="bibr" rid="ref69">69</xref>] and in another case from a more general view [<xref ref-type="bibr" rid="ref73">73</xref>].</p><p>Three of the obtained papers are oriented to bias and mitigation from a specific clinical or technological perspective: in 1 case only for ML [<xref ref-type="bibr" rid="ref75">75</xref>], and in the 2 remaining cases from a nephrology [<xref ref-type="bibr" rid="ref74">74</xref>] or public health perspective [<xref ref-type="bibr" rid="ref76">76</xref>].</p><p>We also found 3 very relevant papers: one [<xref ref-type="bibr" rid="ref70">70</xref>] is a comprehensive classic review focused on bias classification and explanation, including how to design strategies for bias mitigation. Another [<xref ref-type="bibr" rid="ref71">71</xref>] is an official publication from the US National Institute of Standards and Technology, which addresses the definition of standards in bias taxonomy and classification of their categories and suggests a guide to management and mitigation of bias in the AI context. The last one [<xref ref-type="bibr" rid="ref72">72</xref>] is a systematic review of computational bias, with a precise description of fairness metrics and a synthesis of strategies for bias mitigation. 
In addition, the review provides a catalog of software tools and libraries for helping developers and users to explore the issue of fairness and bias in AI.</p><p>In addition, many of the retrieved papers classified in our scoping review as critical appraisal tools also included bias classification, bias mitigation, or glossaries of clinical AI terms [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref52">52</xref>].</p></sec><sec id="s3-5"><title>Chatbot Assessment Studies</title><p>We identified 15 studies related to chatbot assessment studies. Six of them are primary research studies of chatbot assessments (<xref ref-type="table" rid="table4">Table 4</xref>), and 9 are systematic reviews in which chatbots are compared with other interventions (<xref ref-type="table" rid="table5">Table 5</xref>). In 5 of the primary studies, a nonclinical study population consisting of sets of questions, scenarios, vignettes, or a bank of standard questions was used [<xref ref-type="bibr" rid="ref77">77</xref>,<xref ref-type="bibr" rid="ref78">78</xref>,<xref ref-type="bibr" rid="ref80">80</xref>-<xref ref-type="bibr" rid="ref82">82</xref>]. In the other primary study [<xref ref-type="bibr" rid="ref79">79</xref>], the study population was a group of clinicians randomly allocated to view 1 of 2 videos with clinical scenarios, and the clinicians&#x2019; answers were evaluated before and after a chatbot interaction (ChatGPT). 
With respect to the study design, in 3 cases, the objective was to assess chatbot performance [<xref ref-type="bibr" rid="ref77">77</xref>,<xref ref-type="bibr" rid="ref78">78</xref>,<xref ref-type="bibr" rid="ref80">80</xref>], in 2 cases, the aim was to compare performance between different chatbots [<xref ref-type="bibr" rid="ref81">81</xref>,<xref ref-type="bibr" rid="ref82">82</xref>], and in the above-mentioned study involving clinicians, the aim was the exploration of changes in clinical answers after chatbot interaction in a before-and-after scheme [<xref ref-type="bibr" rid="ref79">79</xref>]. All 6 studies assessed modern chatbots (generative chatbots), and all of them mentioned challenges in this area in the discussion (eg, inconsistency in answers, low transparency, &#x201C;hallucinations&#x201D; [when AI models produce incorrect or misleading results], and rates of inappropriate responses). All agreed on the need for specific health care&#x2013;trained interfaces.</p><p>We found 9 systematic reviews (<xref ref-type="table" rid="table5">Table 5</xref>). Two were published in 2021, so the chatbots used were not LLM chatbots (ie, older chatbots) [<xref ref-type="bibr" rid="ref83">83</xref>,<xref ref-type="bibr" rid="ref84">84</xref>], and the other 7 included modern LLM (generative chatbots) with several study designs: Two included only randomized controlled trials [<xref ref-type="bibr" rid="ref88">88</xref>,<xref ref-type="bibr" rid="ref91">91</xref>], 5 included randomized or quasi-experimental studies, or both [<xref ref-type="bibr" rid="ref83">83</xref>,<xref ref-type="bibr" rid="ref84">84</xref>,<xref ref-type="bibr" rid="ref86">86</xref>,<xref ref-type="bibr" rid="ref89">89</xref>,<xref ref-type="bibr" rid="ref90">90</xref>], and 1 included qualitative/mixed-methods studies [<xref ref-type="bibr" rid="ref85">85</xref>]. 
The last one was a systematic review of observational studies focused on counseling for hereditary cancer in selected at-risk adults [<xref ref-type="bibr" rid="ref87">87</xref>]. The tools used to assess study quality depended on the designs included in the systematic review, so different classic tools were used: Cochrane Risk of Bias 2 (Cochrane Collaboration) in 4 cases [<xref ref-type="bibr" rid="ref83">83</xref>,<xref ref-type="bibr" rid="ref87">87</xref>,<xref ref-type="bibr" rid="ref90">90</xref>,<xref ref-type="bibr" rid="ref91">91</xref>], NIH tools for experimental and observational designs in 2 cases [<xref ref-type="bibr" rid="ref86">86</xref>,<xref ref-type="bibr" rid="ref94">94</xref>], JBI&#x2013;cross-sectional tool for observational studies [<xref ref-type="bibr" rid="ref87">87</xref>], and other tools for pre-post and qualitative designs.</p><p>The questions for these reviews are provided in <xref ref-type="table" rid="table5">Table 5</xref>. Most concerned counseling associated with treatment; two were oriented toward weight loss management, different addictions, and reproductive health counseling. In 5 cases, the intervention was a chatbot trained in psychology (sometimes cognitive behavioral therapy) compared with standard care. Finally, regarding the study report, most of the included trials used the CONSORT classic guideline, although CONSORT-AI was published in 2020. However, CONSORT-AI was mentioned in two systematic reviews [<xref ref-type="bibr" rid="ref86">86</xref>,<xref ref-type="bibr" rid="ref88">88</xref>]. 
For systematic review reporting, all included studies used PRISMA classic, which is reasonable because PRISMA-AI was published in 2023.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>We conducted a comprehensive scoping review and identified 70 papers corresponding to the 3 proposed questions: tools for critical appraisal, bias and bias mitigation, and chatbot assessment studies. Although critical appraisal tools were the main objective of this review, AI types of bias were also included because validity (or absence of bias) is an important component of critical appraisal. Chatbot studies were included because they represent an important, recent, and disruptive technology. The three areas together map the current landscape of evidence in the critical appraisal of clinical AI studies.</p><p>We selected critical appraisal as the main domain for this review because it is a wider and more inclusive concept than risk of bias, quality, or reporting, and it is more related to clinical practice. This decision implied a change in the published protocol and was adopted after discussion.</p><p>Reporting guides are essential for authors in writing their studies and for editors in maintaining consistency across publications. In fact, they are a prerequisite for adequate reading. Critical appraisal tools are more focused on making judgments about the validity and applicability of the evidence, and they usually have a diffusion or teaching purpose. A paper can be perfectly reported; it may even be valid, yet be of no use in a clinical setting. Finally, risk of bias and quality are very precise concepts, and their tools are complex and designed as far as possible to avoid inconsistencies, so they are more suitable tools for research syntheses. 
Nevertheless, reporting, critical appraisal, risk of bias, and quality form a cluster of closely related constructs with overlapping areas.</p></sec><sec id="s4-2"><title>Comparison With Previous Work</title><p>Adequate reporting varies according to the structure or the type of study we are addressing, and is not only an editorial requirement but also part of study quality. Obviously, good reporting is a precondition to assess study quality, but there is also empirical evidence that some reporting flaws (or nonreporting) are associated with bias in the effect estimation [<xref ref-type="bibr" rid="ref94">94</xref>,<xref ref-type="bibr" rid="ref95">95</xref>]. Therefore, exploring the reporting is essential to judge the validity of any study, as it facilitates study replication, risk of bias or quality assessments, interpretation of the results, and judgment of the value and applicability of the results in real clinical settings for individualized or collective decisions. It is also necessary to include and assess studies in systematic reviews and to evaluate the systematic review itself. Thus, it is part of the critical appraisal process [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref96">96</xref>,<xref ref-type="bibr" rid="ref97">97</xref>].</p><p>This overlap between reporting and critical appraisal has been a source of inconsistencies between raters when classifying papers in this scoping review. Iterative discussions were necessary to reach an agreement. 
The most important criterion we used to classify the papers within the critical appraisal category was the relevance of the question in the clinical context and a clear intent to help in applicability.</p><p>Quality of study and risk of bias have been used interchangeably, but quality is a descriptive approach to methodological characteristics that may have a possible influence on the effect estimation (called safeguards), whereas risk of bias is an empirical judgment (guided by methodological signaling and criteria) about a possible bias in a particular effect estimation. This new construct of risk of bias is expressed as low, high, or moderate. Currently, the risk of bias is more commonly used than quality [<xref ref-type="bibr" rid="ref98">98</xref>].</p></sec><sec id="s4-3"><title>Strengths and Limitations</title><p>With respect to evidence search, the strategy and the sources are sensitive enough to identify the existing tools for critical appraisal and related constructs. Only 10 papers escaped our formal search strategy and were retrieved by other methods. In addition, we carried out a special effort to search for AI tools in systematic reviews of AI studies during the full-text screening phase. Therefore, we believe that this study is sensitive in capturing the evidence about AI critical appraisal tools.</p><p>As for the selection of sources of evidence for AI tools and data charting (with their implicit value judgment in classifying tools), the iterative process of consensus is a consistent strategy.</p><p>For bias in AI and bias mitigation, the search strategy was able to identify the main papers about bias classification and mitigation, although it was not specifically designed for this purpose. We are aware of the enormous number of existing publications on each specific bias. 
However, the retrieved papers give us an adequate representation of them, which will allow us to make forward and backward &#x201C;snowballing&#x201D; to collect the relevant evidence for future concept analysis studies.</p><p>Clearly, chatbot assessment studies constitute a special group in this review, one that is full of difficulties. First, there is not yet a MeSH term for these studies, so controlled language cannot be used. We have used a reasonable strategy based on free text, synonyms, and truncations, but it may be improved in future updates with the appropriate MeSH term and by using the search strategy of new systematic reviews about chatbots and the use of semantic search technologies. To balance this, we decided to have flexible inclusion criteria. Second, chatbot assessment studies are heterogeneous and inconsistent in the design, analysis, and reporting, so we used ChatGPT-4o for data extraction; however, all outputs were independently reviewed by 2 authors against the original papers, and no major corrections to the extracted information were required. Third, the study populations are variable and are based on preclinical scenarios, vignettes, or a set of questions that lie at the frontier of real clinical practice. On the other hand, the aggregation of systematic reviews on chatbots is not exhaustive but may be considered a detailed and up-to-date list of this type of study, their main characteristics, and the tools used for reporting the individual studies and the reviews, and for assessing the risk of bias. In this sense, a recent review shows results consistent with our study [<xref ref-type="bibr" rid="ref99">99</xref>].</p><p>There are some limitations to this scoping review. The search strategy followed a general approach for all the questions of the study, but was primarily guided by the main question and was not specifically designed for bias and bias mitigation or chatbot assessment. 
On the other hand, the absence of MeSH for chatbot studies and the heterogeneity of objectives, questions, designs, devices, and analyses make it very difficult to search for this type of study. In addition, the methods used to organize the data extraction have a potential limitation due to the novelty of applying LLMs in evidence synthesis, as formal standards for their integration are still under development. Finally, this field is evolving very quickly, so many of the conclusions about the existing evidence have a limited period of validity.</p></sec><sec id="s4-4"><title>Implications for Research</title><p>There is a vast array of tools available, with 2 clear aggregations in the areas of reporting guides and critical appraisal tools. Thus, the newly arising question is: What is the best tool for a particular setting or specific purpose? At the same time, there are some gaps in knowledge identified in this scoping review. These aggregations and the existing gaps have implications for research and for clinical practice.</p><p>Reporting guides have been recently synthesized in a systematic review [<xref ref-type="bibr" rid="ref100">100</xref>] that also includes tools for basic and laboratory research, and whose search ended in 2022. This topic should be harmonized, and the review should probably be either updated or reformulated from a clinical standpoint.</p><p>Similarly, critical appraisal tools are enormously varied and full of different nuances and approaches, so selecting one of them can be very challenging. We believe that the topic deserves a qualitative synthesis to clarify the key elements for choosing.</p><p>New risk of bias tools for AI in prognosis and diagnosis (QUADAS-AI and PROBAST-AI), as well as a PRISMA-AI extension for systematic reviews, are expected, as well as the Chatbot Assessment Reporting Tool (CHART), for reporting chatbot assessment studies. 
The AI extension of other classic tools, such as Cochrane Risk of Bias and ROBINS, among others, should be considered.</p><p>Finally, the development of standards for the design, reporting, and assessment of chatbot assessment studies and chatbot health advisory studies is a clear gap in our toolbox and needs to be addressed.</p></sec><sec id="s4-5"><title>Implications for Clinical Practice</title><p>In the realm of clinical practice, it is important to clarify the appropriate selection of adequate tools for critical appraisal, and it is essential to develop teaching strategies for the dissemination of skills for the critical appraisal of AI studies, including knowledge about the types of bias to be tackled [<xref ref-type="bibr" rid="ref101">101</xref>].</p></sec><sec id="s4-6"><title>Conclusion</title><p>&#x201C;We can only see a small distance ahead, but we can see plenty that needs to be done&#x201D; [<xref ref-type="bibr" rid="ref102">102</xref>].</p></sec></sec></body><back><ack><p>This project was designed and developed as one of the first steps toward integrating AI education in clinical settings. A hybrid approach was used for data extraction for chatbot studies, combining the active involvement of a researcher with a fully supervised ChatGPT-RAG model. However, AI was not used to write or compose this manuscript.</p></ack><fn-group><fn fn-type="con"><p>AJB was responsible for the writing and editing of the paper. EAZ designed and undertook the search and participated in writing the paper. JBC designed the study, participated in the screening, data charting, and the consensus process, designed the analysis, and contributed to writing and editing the paper. IUB participated in screening, data charting, and the consensus process. JIEK participated in screening and the consensus process. JIPZ participated in screening and the consensus process. 
LRR approved the protocol and participated in screening, data charting, the consensus process, and writing the paper. MMF contributed to the design and participated in screening, data charting, writing, and editing the paper. MUC designed and undertook the search and participated in writing the paper. MTI participated in the screening. MTGS participated in screening, data charting, and the consensus process, and participated in the analysis and editing of the paper. VRG participated in designing the protocol, screening, data charting, and the consensus process, and participated in the analysis and editing of the paper.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CAIR</term><def><p>Clinical Artificial Intelligence Research</p></def></def-item><def-item><term id="abb3">CHART</term><def><p>Chatbot Assessment Reporting Tool</p></def></def-item><def-item><term id="abb4">CLAIM</term><def><p>Checklist for Artificial Intelligence in Medical Imaging</p></def></def-item><def-item><term id="abb5">CONSORT-AI</term><def><p>Consolidated Standards of Reporting Trials extension for Artificial Intelligence</p></def></def-item><def-item><term id="abb6">JBI</term><def><p>Joanna Briggs Institute</p></def></def-item><def-item><term id="abb7">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb8">MeSH</term><def><p>Medical Subject Headings</p></def></def-item><def-item><term id="abb9">METRICS</term><def><p>METhodological RadiomICs Score</p></def></def-item><def-item><term id="abb10">MINIMAR</term><def><p>MINimum Information for Medical AI Reporting</p></def></def-item><def-item><term id="abb11">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb12">PRIME</term><def><p>Proposed Requirements for Cardiovascular Imaging Related Machine Learning 
Evaluation</p></def></def-item><def-item><term id="abb13">PRISMA</term><def><p>Preferred Reporting Items for Systematic reviews and Meta-Analyses</p></def></def-item><def-item><term id="abb14">PRISMA-ScR</term><def><p>Preferred Reporting Items for Systematic reviews and Meta-Analyses extension for Scoping Reviews</p></def></def-item><def-item><term id="abb15">PROBAST-AI</term><def><p>Prediction model Risk Of Bias ASsessment Tool for AI studies</p></def></def-item><def-item><term id="abb16">QAMAI</term><def><p>Quality Analysis of Medical Artificial Intelligence</p></def></def-item><def-item><term id="abb17">QUADAS-AI</term><def><p>Quality Assessment of Diagnostic Accuracy Studies for Artificial Intelligence</p></def></def-item><def-item><term id="abb18">SPIRIT-AI</term><def><p>Standard Protocol Items: Recommendations for Interventional Trials involving Artificial Intelligence</p></def></def-item><def-item><term id="abb19">STREAM-URO</term><def><p>Standardized Reporting of ML Applications in Urology</p></def></def-item><def-item><term id="abb20">TRIPOD-AI</term><def><p>Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis using AI</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hultcrantz</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rind</surname><given-names>D</given-names> </name><name name-style="western"><surname>Akl</surname><given-names>EA</given-names> </name><etal/></person-group><article-title>The GRADE Working Group clarifies the construct of certainty of evidence</article-title><source>J Clin Epidemiol</source><year>2017</year><month>07</month><volume>87</volume><fpage>4</fpage><lpage>13</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2017.05.006</pub-id><pub-id 
pub-id-type="medline">28529184</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>McCarthy</surname><given-names>J</given-names> </name><name name-style="western"><surname>Minsky</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Rochester</surname><given-names>N</given-names> </name><name name-style="western"><surname>Shannon</surname><given-names>CE</given-names> </name></person-group><article-title>A proposal for the dartmouth summer research project on artificial intelligence</article-title><source>Stanford University</source><year>1955</year><access-date>2025-11-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www-formal.stanford.edu/jmc/history/dartmouth/dartmouth.html">https://www-formal.stanford.edu/jmc/history/dartmouth/dartmouth.html</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kaul</surname><given-names>V</given-names> </name><name name-style="western"><surname>Enslin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gross</surname><given-names>SA</given-names> </name></person-group><article-title>History of artificial intelligence in medicine</article-title><source>Gastrointest Endosc</source><year>2020</year><month>10</month><volume>92</volume><issue>4</issue><fpage>807</fpage><lpage>812</lpage><pub-id pub-id-type="doi">10.1016/j.gie.2020.06.040</pub-id><pub-id pub-id-type="medline">32565184</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kohane</surname><given-names>IS</given-names> </name></person-group><article-title>Injecting artificial intelligence into 
medicine</article-title><source>NEJM AI</source><year>2024</year><month>01</month><volume>1</volume><issue>1</issue><pub-id pub-id-type="doi">10.1056/AIe2300197</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jayakumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sounderajah</surname><given-names>V</given-names> </name><name name-style="western"><surname>Normahani</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Quality assessment standards in artificial intelligence diagnostic accuracy systematic reviews: a meta-research study</article-title><source>NPJ Digit Med</source><year>2022</year><month>01</month><day>27</day><volume>5</volume><issue>1</issue><fpage>11</fpage><pub-id pub-id-type="doi">10.1038/s41746-021-00544-y</pub-id><pub-id pub-id-type="medline">35087178</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Quirk</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mac Donnchadha</surname><given-names>C</given-names> </name><name name-style="western"><surname>Vaantaja</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Future implications of artificial intelligence in lung cancer screening: a systematic review</article-title><source>BJR Open</source><year>2024</year><month>01</month><volume>6</volume><issue>1</issue><fpage>tzae035</fpage><pub-id pub-id-type="doi">10.1093/bjro/tzae035</pub-id><pub-id pub-id-type="medline">39444460</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chakraborty</surname><given-names>C</given-names> </name><name 
name-style="western"><surname>Pal</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bhattacharya</surname><given-names>M</given-names> </name><name name-style="western"><surname>Dash</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SS</given-names> </name></person-group><article-title>Overview of chatbots with special emphasis on artificial intelligence-enabled ChatGPT in medical science</article-title><source>Front Artif Intell</source><year>2023</year><volume>6</volume><fpage>1237704</fpage><pub-id pub-id-type="doi">10.3389/frai.2023.1237704</pub-id><pub-id pub-id-type="medline">38028668</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fleuren</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Klausch</surname><given-names>TLT</given-names> </name><name name-style="western"><surname>Zwager</surname><given-names>CL</given-names> </name><etal/></person-group><article-title>Machine learning for the prediction of sepsis: a systematic review and meta-analysis of diagnostic test accuracy</article-title><source>Intensive Care Med</source><year>2020</year><month>03</month><volume>46</volume><issue>3</issue><fpage>383</fpage><lpage>400</lpage><pub-id pub-id-type="doi">10.1007/s00134-019-05872-y</pub-id><pub-id pub-id-type="medline">31965266</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barker</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Stone</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Sears</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Revising the JBI quantitative critical appraisal tools to 
improve their applicability: an overview of methods and the development process</article-title><source>JBI Evid Synth</source><year>2023</year><month>03</month><day>1</day><volume>21</volume><issue>3</issue><fpage>478</fpage><lpage>493</lpage><pub-id pub-id-type="doi">10.11124/JBIES-22-00125</pub-id><pub-id pub-id-type="medline">36121230</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Crossnohere</surname><given-names>NL</given-names> </name><name name-style="western"><surname>Elsaid</surname><given-names>M</given-names> </name><name name-style="western"><surname>Paskett</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bose-Brill</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bridges</surname><given-names>JFP</given-names> </name></person-group><article-title>Guidelines for artificial intelligence in medicine: literature review and content analysis of frameworks</article-title><source>J Med Internet Res</source><year>2022</year><month>08</month><day>25</day><volume>24</volume><issue>8</issue><fpage>e36823</fpage><pub-id pub-id-type="doi">10.2196/36823</pub-id><pub-id pub-id-type="medline">36006692</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ibrahim</surname><given-names>H</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Rivera</surname><given-names>SC</given-names> </name><etal/></person-group><article-title>Reporting guidelines for clinical trials of artificial intelligence interventions: the SPIRIT-AI and CONSORT-AI 
guidelines</article-title><source>Trials</source><year>2021</year><month>01</month><day>6</day><volume>22</volume><issue>1</issue><fpage>11</fpage><pub-id pub-id-type="doi">10.1186/s13063-020-04951-6</pub-id><pub-id pub-id-type="medline">33407780</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Levac</surname><given-names>D</given-names> </name><name name-style="western"><surname>Colquhoun</surname><given-names>H</given-names> </name><name name-style="western"><surname>O&#x2019;Brien</surname><given-names>KK</given-names> </name></person-group><article-title>Scoping studies: advancing the methodology</article-title><source>Implement Sci</source><year>2010</year><month>09</month><day>20</day><volume>5</volume><issue>1</issue><fpage>69</fpage><pub-id pub-id-type="doi">10.1186/1748-5908-5-69</pub-id><pub-id pub-id-type="medline">20854677</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Aromataris</surname><given-names>E</given-names> </name><name name-style="western"><surname>Lockwood</surname><given-names>C</given-names> </name><name name-style="western"><surname>Porritt</surname><given-names>K</given-names> </name><name name-style="western"><surname>Pilla</surname><given-names>B</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Jordan</surname><given-names>Z</given-names> </name></person-group><source>JBI Manual for Evidence Synthesis</source><year>2024</year><access-date>2024-03-02</access-date><publisher-name>JBI</publisher-name><pub-id pub-id-type="doi">10.46658/JBIMES-24-01</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Tricco</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Lillie</surname><given-names>E</given-names> </name><name name-style="western"><surname>Zarin</surname><given-names>W</given-names> </name><etal/></person-group><article-title>PRISMA Extension for Scoping Reviews (PRISMA-ScR): checklist and explanation</article-title><source>Ann Intern Med</source><year>2018</year><month>10</month><day>2</day><volume>169</volume><issue>7</issue><fpage>467</fpage><lpage>473</lpage><pub-id pub-id-type="doi">10.7326/M18-0850</pub-id><pub-id pub-id-type="medline">30178033</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rethlefsen</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Kirtley</surname><given-names>S</given-names> </name><name name-style="western"><surname>Waffenschmidt</surname><given-names>S</given-names> </name><etal/></person-group><article-title>PRISMA-S: an extension to the PRISMA Statement for Reporting Literature Searches in Systematic Reviews</article-title><source>Syst Rev</source><year>2021</year><month>01</month><day>26</day><volume>10</volume><issue>1</issue><fpage>39</fpage><pub-id pub-id-type="doi">10.1186/s13643-020-01542-z</pub-id><pub-id pub-id-type="medline">33499930</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>Critical appraisal tool for artificial intelligence clinical studies. 
A scoping review</article-title><source>OSF</source><access-date>2025-11-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.17605/OSF.IO/ETYDS">https://doi.org/10.17605/OSF.IO/ETYDS</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><article-title>What is PROSPERO?</article-title><source>PROSPERO</source><access-date>2024-03-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.crd.york.ac.uk/PROSPERO/home">https://www.crd.york.ac.uk/PROSPERO/home</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>Home</article-title><source>OSF</source><access-date>2025-11-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://osf.io/">https://osf.io/</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>Home</article-title><source>Research Registry</source><access-date>2025-11-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.researchregistry.com/">https://www.researchregistry.com/</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="web"><article-title>Library for health research reporting</article-title><source>EQUATOR Network</source><access-date>2024-05-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.equator-network.org/library/">https://www.equator-network.org/library/</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pollock</surname><given-names>D</given-names> </name><name name-style="western"><surname>Peters</surname><given-names>MDJ</given-names> </name><name name-style="western"><surname>Khalil</surname><given-names>H</given-names> 
</name><etal/></person-group><article-title>Recommendations for the extraction, analysis, and presentation of results in scoping reviews</article-title><source>JBI Evid Synth</source><year>2023</year><month>03</month><day>1</day><volume>21</volume><issue>3</issue><fpage>520</fpage><lpage>532</lpage><pub-id pub-id-type="doi">10.11124/JBIES-22-00123</pub-id><pub-id pub-id-type="medline">36081365</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luo</surname><given-names>W</given-names> </name><name name-style="western"><surname>Phung</surname><given-names>D</given-names> </name><name name-style="western"><surname>Tran</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Guidelines for developing and reporting machine learning predictive models in biomedical research: a multidisciplinary view</article-title><source>J Med Internet Res</source><year>2016</year><month>12</month><day>16</day><volume>18</volume><issue>12</issue><fpage>e323</fpage><pub-id pub-id-type="doi">10.2196/jmir.5870</pub-id><pub-id pub-id-type="medline">27986644</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lambin</surname><given-names>P</given-names> </name><name name-style="western"><surname>Leijenaar</surname><given-names>RTH</given-names> </name><name name-style="western"><surname>Deist</surname><given-names>TM</given-names> </name><etal/></person-group><article-title>Radiomics: the bridge between medical imaging and personalized medicine</article-title><source>Nat Rev Clin Oncol</source><year>2017</year><month>12</month><volume>14</volume><issue>12</issue><fpage>749</fpage><lpage>762</lpage><pub-id pub-id-type="doi">10.1038/nrclinonc.2017.141</pub-id><pub-id 
pub-id-type="medline">28975929</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qiao</surname><given-names>N</given-names> </name></person-group><article-title>A systematic review on machine learning in sellar region diseases: quality and reporting items</article-title><source>Endocr Connect</source><year>2019</year><month>07</month><volume>8</volume><issue>7</issue><fpage>952</fpage><lpage>960</lpage><pub-id pub-id-type="doi">10.1530/EC-19-0156</pub-id><pub-id pub-id-type="medline">31234143</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>PHC</given-names> </name><name name-style="western"><surname>Krause</surname><given-names>J</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>L</given-names> </name></person-group><article-title>How to read articles that use machine learning: users&#x2019; guides to the medical literature</article-title><source>JAMA</source><year>2019</year><month>11</month><day>12</day><volume>322</volume><issue>18</issue><fpage>1806</fpage><lpage>1816</lpage><pub-id pub-id-type="doi">10.1001/jama.2019.16489</pub-id><pub-id pub-id-type="medline">31714992</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vollmer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mateen</surname><given-names>BA</given-names> </name><name name-style="western"><surname>Bohner</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Machine learning and artificial intelligence research for 
patient benefit: 20 critical questions on transparency, replicability, ethics, and effectiveness</article-title><source>BMJ</source><year>2020</year><month>03</month><day>20</day><volume>368</volume><fpage>l6927</fpage><pub-id pub-id-type="doi">10.1136/bmj.l6927</pub-id><pub-id pub-id-type="medline">32198138</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cruz Rivera</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Denniston</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Calvert</surname><given-names>MJ</given-names> </name><collab>SPIRIT-AI and CONSORT-AI Working Group</collab></person-group><article-title>Guidelines for clinical trial protocols for interventions involving artificial intelligence: the SPIRIT-AI extension</article-title><source>Lancet Digit Health</source><year>2020</year><month>10</month><volume>2</volume><issue>10</issue><fpage>e549</fpage><lpage>e560</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(20)30219-3</pub-id><pub-id pub-id-type="medline">33328049</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Faes</surname><given-names>L</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wagner</surname><given-names>SK</given-names> </name><etal/></person-group><article-title>A clinician&#x2019;s guide to artificial intelligence: how to critically appraise machine learning studies</article-title><source>Transl Vis Sci 
Technol</source><year>2020</year><month>02</month><day>12</day><volume>9</volume><issue>2</issue><fpage>7</fpage><pub-id pub-id-type="doi">10.1167/tvst.9.2.7</pub-id><pub-id pub-id-type="medline">32704413</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hernandez-Boussard</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bozkurt</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ioannidis</surname><given-names>JPA</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>NH</given-names> </name></person-group><article-title>MINIMAR (MINimum Information for Medical AI Reporting): developing reporting standards for artificial intelligence in health care</article-title><source>J Am Med Inform Assoc</source><year>2020</year><month>12</month><day>9</day><volume>27</volume><issue>12</issue><fpage>2011</fpage><lpage>2015</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa088</pub-id><pub-id pub-id-type="medline">32594179</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Rivera</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Calvert</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Denniston</surname><given-names>AK</given-names> </name><collab>SPIRIT-AI and CONSORT-AI Working Group</collab></person-group><article-title>Reporting guidelines for clinical trial reports for interventions involving artificial intelligence: the CONSORT-AI 
Extension</article-title><source>BMJ</source><year>2020</year><month>09</month><day>9</day><volume>370</volume><fpage>m3164</fpage><pub-id pub-id-type="doi">10.1136/bmj.m3164</pub-id><pub-id pub-id-type="medline">32909959</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mongan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Moy</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kahn</surname><given-names>CE</given-names> </name></person-group><article-title>Checklist for Artificial Intelligence in Medical Imaging (CLAIM): a guide for authors and reviewers</article-title><source>Radiol Artif Intell</source><year>2020</year><month>03</month><volume>2</volume><issue>2</issue><fpage>e200029</fpage><pub-id pub-id-type="doi">10.1148/ryai.2020200029</pub-id><pub-id pub-id-type="medline">33937821</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Norgeot</surname><given-names>B</given-names> </name><name name-style="western"><surname>Quer</surname><given-names>G</given-names> </name><name name-style="western"><surname>Beaulieu-Jones</surname><given-names>BK</given-names> </name><etal/></person-group><article-title>Minimum information about clinical artificial intelligence modeling: the MI-CLAIM checklist</article-title><source>Nat Med</source><year>2020</year><month>09</month><volume>26</volume><issue>9</issue><fpage>1320</fpage><lpage>1324</lpage><pub-id pub-id-type="doi">10.1038/s41591-020-1041-y</pub-id><pub-id pub-id-type="medline">32908275</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Sengupta</surname><given-names>PP</given-names> </name><name name-style="western"><surname>Shrestha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Berthon</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Proposed Requirements for Cardiovascular Imaging-Related Machine Learning Evaluation (PRIME): a checklist: reviewed by the American College of Cardiology Healthcare Innovation Council</article-title><source>JACC Cardiovasc Imaging</source><year>2020</year><month>09</month><volume>13</volume><issue>9</issue><fpage>2017</fpage><lpage>2035</lpage><pub-id pub-id-type="doi">10.1016/j.jcmg.2020.07.015</pub-id><pub-id pub-id-type="medline">32912474</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stevens</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Mortazavi</surname><given-names>BJ</given-names> </name><name name-style="western"><surname>Deo</surname><given-names>RC</given-names> </name><name name-style="western"><surname>Curtis</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kao</surname><given-names>DP</given-names> </name></person-group><article-title>Recommendations for reporting machine learning analyses in clinical research</article-title><source>Circ Cardiovasc Qual Outcomes</source><year>2020</year><month>10</month><volume>13</volume><issue>10</issue><fpage>e006556</fpage><pub-id pub-id-type="doi">10.1161/CIRCOUTCOMES.120.006556</pub-id><pub-id pub-id-type="medline">33079589</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cabitza</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Campagner</surname><given-names>A</given-names> </name></person-group><article-title>The need to separate the wheat from the chaff in medical informatics: introducing a comprehensive checklist for the (self)-assessment of medical AI studies</article-title><source>Int J Med Inform</source><year>2021</year><month>09</month><volume>153</volume><fpage>104510</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2021.104510</pub-id><pub-id pub-id-type="medline">34108105</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>El Naqa</surname><given-names>I</given-names> </name><name name-style="western"><surname>Boone</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Benedict</surname><given-names>SH</given-names> </name><etal/></person-group><article-title>AI in medical physics: guidelines for publication</article-title><source>Med Phys</source><year>2021</year><month>09</month><volume>48</volume><issue>9</issue><fpage>4711</fpage><lpage>4714</lpage><pub-id pub-id-type="doi">10.1002/mp.15170</pub-id><pub-id pub-id-type="medline">34545957</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kwong</surname><given-names>JCC</given-names> </name><name name-style="western"><surname>McLoughlin</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Haider</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Standardized Reporting of Machine Learning Applications in Urology: the STREAM-URO framework</article-title><source>Eur Urol Focus</source><year>2021</year><month>07</month><volume>7</volume><issue>4</issue><fpage>672</fpage><lpage>682</lpage><pub-id pub-id-type="doi">10.1016/j.euf.2021.07.004</pub-id><pub-id 
pub-id-type="medline">34362709</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meshaka</surname><given-names>R</given-names> </name><name name-style="western"><surname>Pinto Dos Santos</surname><given-names>D</given-names> </name><name name-style="western"><surname>Arthurs</surname><given-names>OJ</given-names> </name><name name-style="western"><surname>Sebire</surname><given-names>NJ</given-names> </name><name name-style="western"><surname>Shelmerdine</surname><given-names>SC</given-names> </name></person-group><article-title>Artificial intelligence reporting guidelines: what the pediatric radiologist needs to know</article-title><source>Pediatr Radiol</source><year>2022</year><month>10</month><volume>52</volume><issue>11</issue><fpage>2101</fpage><lpage>2110</lpage><pub-id pub-id-type="doi">10.1007/s00247-021-05129-1</pub-id><pub-id pub-id-type="medline">34196729</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Olczak</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pavlopoulos</surname><given-names>J</given-names> </name><name name-style="western"><surname>Prijs</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Presenting artificial intelligence, deep learning, and machine learning studies to clinicians and healthcare stakeholders: an introductory reference with a guideline and a Clinical AI Research (CAIR) checklist proposal</article-title><source>Acta Orthop</source><year>2021</year><month>10</month><volume>92</volume><issue>5</issue><fpage>513</fpage><lpage>525</lpage><pub-id pub-id-type="doi">10.1080/17453674.2021.1918389</pub-id><pub-id pub-id-type="medline">33988081</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schwendicke</surname><given-names>F</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>JH</given-names> </name><etal/></person-group><article-title>Artificial intelligence in dental research: checklist for authors, reviewers, readers</article-title><source>J Dent</source><year>2021</year><month>04</month><volume>107</volume><fpage>103610</fpage><pub-id pub-id-type="doi">10.1016/j.jdent.2021.103610</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sounderajah</surname><given-names>V</given-names> </name><name name-style="western"><surname>Ashrafian</surname><given-names>H</given-names> </name><name name-style="western"><surname>Rose</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A quality assessment tool for artificial intelligence-centered diagnostic test accuracy studies: QUADAS-AI</article-title><source>Nat Med</source><year>2021</year><month>10</month><volume>27</volume><issue>10</issue><fpage>1663</fpage><lpage>1665</lpage><pub-id pub-id-type="doi">10.1038/s41591-021-01517-0</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sounderajah</surname><given-names>V</given-names> </name><name name-style="western"><surname>Ashrafian</surname><given-names>H</given-names> </name><name name-style="western"><surname>Golub</surname><given-names>RM</given-names> </name><etal/></person-group><article-title>Developing a reporting guideline for artificial intelligence-centred diagnostic test accuracy studies: the STARD-AI protocol</article-title><source>BMJ 
Open</source><year>2021</year><month>06</month><volume>11</volume><issue>6</issue><fpage>e047709</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2020-047709</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vinny</surname><given-names>PW</given-names> </name><name name-style="western"><surname>Garg</surname><given-names>R</given-names> </name><name name-style="western"><surname>Padma Srivastava</surname><given-names>MV</given-names> </name><name name-style="western"><surname>Lal</surname><given-names>V</given-names> </name><name name-style="western"><surname>Vishnu</surname><given-names>VY</given-names> </name></person-group><article-title>Critical appraisal of a machine learning paper: a guide for the neurologist</article-title><source>Ann Indian Acad Neurol</source><year>2021</year><volume>24</volume><issue>4</issue><fpage>481</fpage><lpage>489</lpage><pub-id pub-id-type="doi">10.4103/aian.AIAN_1120_20</pub-id><pub-id pub-id-type="medline">34728938</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Collins</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Dhiman</surname><given-names>P</given-names> </name><name name-style="western"><surname>Andaur Navarro</surname><given-names>CL</given-names> </name><etal/></person-group><article-title>Protocol for development of a reporting guideline (TRIPOD-AI) and risk of bias tool (PROBAST-AI) for diagnostic and prognostic prediction model studies based on artificial intelligence</article-title><source>BMJ Open</source><year>2021</year><month>07</month><day>9</day><volume>11</volume><issue>7</issue><fpage>e048008</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2020-048008</pub-id><pub-id 
pub-id-type="medline">34244270</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Al-Zaiti</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Alghwiri</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>X</given-names> </name><etal/></person-group><article-title>A clinician&#x2019;s guide to understanding and critically appraising machine learning studies: a checklist for Ruling Out Bias Using Standard Tools in Machine Learning (ROBUST-ML)</article-title><source>Eur Heart J Digit Health</source><year>2022</year><month>06</month><volume>3</volume><issue>2</issue><fpage>125</fpage><lpage>140</lpage><pub-id pub-id-type="doi">10.1093/ehjdh/ztac016</pub-id><pub-id pub-id-type="medline">36713011</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Daneshjou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Barata</surname><given-names>C</given-names> </name><name name-style="western"><surname>Betz-Stablein</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Checklist for Evaluation of Image-Based Artificial Intelligence Reports in Dermatology: CLEAR Derm Consensus Guidelines From the International Skin Imaging Collaboration Artificial Intelligence Working Group</article-title><source>JAMA Dermatol</source><year>2022</year><month>01</month><day>1</day><volume>158</volume><issue>1</issue><fpage>90</fpage><lpage>96</lpage><pub-id pub-id-type="doi">10.1001/jamadermatol.2021.4915</pub-id><pub-id pub-id-type="medline">34851366</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Haller</surname><given-names>S</given-names> </name><name name-style="western"><surname>Van Cauter</surname><given-names>S</given-names> </name><name name-style="western"><surname>Federau</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hedderich</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Edjlali</surname><given-names>M</given-names> </name></person-group><article-title>The R-AI-DIOLOGY checklist: a practical checklist for evaluation of artificial intelligence tools in clinical neuroradiology</article-title><source>Neuroradiology</source><year>2022</year><month>05</month><volume>64</volume><issue>5</issue><fpage>851</fpage><lpage>864</lpage><pub-id pub-id-type="doi">10.1007/s00234-021-02890-w</pub-id><pub-id pub-id-type="medline">35098343</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jha</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Bradshaw</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Buvat</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Nuclear medicine and artificial intelligence: best practices for evaluation (the RELAINCE Guidelines)</article-title><source>J Nucl Med</source><year>2022</year><month>09</month><volume>63</volume><issue>9</issue><fpage>1288</fpage><lpage>1299</lpage><pub-id pub-id-type="doi">10.2967/jnumed.121.263239</pub-id><pub-id pub-id-type="medline">35618476</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Padula</surname><given-names>WV</given-names> </name><name name-style="western"><surname>Kreif</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Vanness</surname><given-names>DJ</given-names> </name><etal/></person-group><article-title>Machine learning methods in health economics and outcomes research-the PALISADE checklist: a good practices report of an ISPOR task force</article-title><source>Value Health</source><year>2022</year><month>07</month><volume>25</volume><issue>7</issue><fpage>1063</fpage><lpage>1080</lpage><pub-id pub-id-type="doi">10.1016/j.jval.2022.03.022</pub-id><pub-id pub-id-type="medline">35779937</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Smeden</surname><given-names>M</given-names> </name><name name-style="western"><surname>Heinze</surname><given-names>G</given-names> </name><name name-style="western"><surname>Van Calster</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Critical appraisal of artificial intelligence-based prediction models for cardiovascular disease</article-title><source>Eur Heart J</source><year>2022</year><month>08</month><day>14</day><volume>43</volume><issue>31</issue><fpage>2921</fpage><lpage>2930</lpage><pub-id pub-id-type="doi">10.1093/eurheartj/ehac238</pub-id><pub-id pub-id-type="medline">35639667</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vasey</surname><given-names>B</given-names> </name><name name-style="western"><surname>Nagendran</surname><given-names>M</given-names> </name><name name-style="western"><surname>Campbell</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Reporting guideline for the early-stage clinical evaluation of decision support systems driven by artificial intelligence: DECIDE-AI</article-title><source>Nat 
Med</source><year>2022</year><month>05</month><volume>28</volume><issue>5</issue><fpage>924</fpage><lpage>933</lpage><pub-id pub-id-type="doi">10.1038/s41591-022-01772-9</pub-id><pub-id pub-id-type="medline">35585198</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jones</surname><given-names>OT</given-names> </name><name name-style="western"><surname>Matin</surname><given-names>RN</given-names> </name><name name-style="western"><surname>van der Schaar</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Artificial intelligence and machine learning algorithms for early detection of skin cancer in community and primary care settings: a systematic review</article-title><source>Lancet Digit Health</source><year>2022</year><month>06</month><volume>4</volume><issue>6</issue><fpage>e466</fpage><lpage>e476</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(22)00023-1</pub-id><pub-id pub-id-type="medline">35623799</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Cabello</surname><given-names>J</given-names> </name></person-group><article-title>Lectura cr&#x00ED;tica de estudios cl&#x00ED;nicos in virtuo/in silico: modelos-simulaciones, inteligencia artificial y big data</article-title><source>Lect Cr&#x00ED;tica Evid Cl&#x00ED;nica</source><year>2022</year><access-date>2025-11-24</access-date><edition>2</edition><publisher-name>Elsevier</publisher-name><fpage>253</fpage><lpage>272</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.clinicalkey.es/#!/content/book/3-s2.0-B9788491138839000204">https://www.clinicalkey.es/#!/content/book/3-s2.0-B9788491138839000204</ext-link></comment><pub-id pub-id-type="other">978-84-9113-883-9</pub-id></nlm-citation></ref><ref 
id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cacciamani</surname><given-names>GE</given-names> </name><name name-style="western"><surname>Chu</surname><given-names>TN</given-names> </name><name name-style="western"><surname>Sanford</surname><given-names>DI</given-names> </name><etal/></person-group><article-title>PRISMA AI reporting guidelines for systematic reviews and meta-analyses on AI in healthcare</article-title><source>Nat Med</source><year>2023</year><month>01</month><volume>29</volume><issue>1</issue><fpage>14</fpage><lpage>15</lpage><pub-id pub-id-type="doi">10.1038/s41591-022-02139-w</pub-id><pub-id pub-id-type="medline">36646804</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cerd&#x00E1;-Alberich</surname><given-names>L</given-names> </name><name name-style="western"><surname>Solana</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mallol</surname><given-names>P</given-names> </name><etal/></person-group><article-title>MAIC-10 brief quality checklist for publications using artificial intelligence and medical images</article-title><source>Insights Imaging</source><year>2023</year><month>01</month><day>16</day><volume>14</volume><issue>1</issue><fpage>11</fpage><pub-id pub-id-type="doi">10.1186/s13244-022-01355-9</pub-id><pub-id pub-id-type="medline">36645542</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kocak</surname><given-names>B</given-names> </name><name name-style="western"><surname>Baessler</surname><given-names>B</given-names> </name><name name-style="western"><surname>Bakas</surname><given-names>S</given-names> 
</name><etal/></person-group><article-title>CheckList for EvaluAtion of Radiomics research (CLEAR): a step-by-step reporting guideline for authors and reviewers endorsed by ESR and EuSoMII</article-title><source>Insights Imaging</source><year>2023</year><month>05</month><day>4</day><volume>14</volume><issue>1</issue><fpage>75</fpage><pub-id pub-id-type="doi">10.1186/s13244-023-01415-8</pub-id><pub-id pub-id-type="medline">37142815</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kwong</surname><given-names>JCC</given-names> </name><name name-style="western"><surname>Khondker</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lajkosz</surname><given-names>K</given-names> </name><etal/></person-group><article-title>APPRAISE-AI tool for quantitative evaluation of AI studies for clinical decision support</article-title><source>JAMA Netw Open</source><year>2023</year><month>09</month><day>5</day><volume>6</volume><issue>9</issue><fpage>e2335377</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.35377</pub-id><pub-id pub-id-type="medline">37747733</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Park</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Sul</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Ko</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jang</surname><given-names>HY</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>JG</given-names> </name></person-group><article-title>Radiologist&#x2019;s guide to evaluating publications of clinical research on AI: how we do 
it</article-title><source>Radiology</source><year>2023</year><month>09</month><volume>308</volume><issue>3</issue><fpage>e230288</fpage><pub-id pub-id-type="doi">10.1148/radiol.230288</pub-id><pub-id pub-id-type="medline">37750772</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="web"><article-title>Good machine learning practice for medical device development: guiding principles</article-title><source>FDA</source><access-date>2025-11-05</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.fda.gov/medical-devices/software-medical-device-samd/good-machine-learning-practice-medical-device-development-guiding-principles">https://www.fda.gov/medical-devices/software-medical-device-samd/good-machine-learning-practice-medical-device-development-guiding-principles</ext-link></comment></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Collins</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Moons</surname><given-names>KGM</given-names> </name><name name-style="western"><surname>Dhiman</surname><given-names>P</given-names> </name><etal/></person-group><article-title>TRIPOD+AI statement: updated guidance for reporting clinical prediction models that use regression or machine learning methods</article-title><source>BMJ</source><year>2024</year><month>04</month><day>16</day><volume>385</volume><fpage>e078378</fpage><pub-id pub-id-type="doi">10.1136/bmj-2023-078378</pub-id><pub-id pub-id-type="medline">38626948</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>du Toit</surname><given-names>C</given-names> </name><name name-style="western"><surname>Tran</surname><given-names>TQB</given-names> </name><name 
name-style="western"><surname>Deo</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Survey and evaluation of hypertension machine learning research</article-title><source>J Am Heart Assoc</source><year>2023</year><month>05</month><day>2</day><volume>12</volume><issue>9</issue><fpage>e027896</fpage><pub-id pub-id-type="doi">10.1161/JAHA.122.027896</pub-id><pub-id pub-id-type="medline">37119074</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cote</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Lubowitz</surname><given-names>JH</given-names> </name></person-group><article-title>Recommended requirements and essential elements for proper reporting of the use of artificial intelligence machine learning tools in biomedical research and scientific publications</article-title><source>Arthroscopy</source><year>2024</year><month>04</month><volume>40</volume><issue>4</issue><fpage>1033</fpage><lpage>1038</lpage><pub-id pub-id-type="doi">10.1016/j.arthro.2023.12.027</pub-id><pub-id pub-id-type="medline">38300189</pub-id></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kocak</surname><given-names>B</given-names> </name><name name-style="western"><surname>Akinci D&#x2019;Antonoli</surname><given-names>T</given-names> </name><name name-style="western"><surname>Mercaldo</surname><given-names>N</given-names> </name><etal/></person-group><article-title>METhodological RadiomICs Score (METRICS): a quality scoring tool for radiomics research endorsed by EuSoMII</article-title><source>Insights Imaging</source><year>2024</year><month>01</month><day>17</day><volume>15</volume><issue>1</issue><fpage>8</fpage><pub-id pub-id-type="doi">10.1186/s13244-023-01572-w</pub-id><pub-id 
pub-id-type="medline">38228979</pub-id></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lekadir</surname><given-names>K</given-names> </name><name name-style="western"><surname>Osuala</surname><given-names>R</given-names> </name><name name-style="western"><surname>Gallin</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Guiding principles and consensus recommendations for trustworthy artificial intelligence in medical imaging</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 20, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2109.09658</pub-id></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Scott</surname><given-names>I</given-names> </name><name name-style="western"><surname>Carter</surname><given-names>S</given-names> </name><name name-style="western"><surname>Coiera</surname><given-names>E</given-names> </name></person-group><article-title>Clinician checklist for assessing suitability of machine learning applications in healthcare</article-title><source>BMJ Health Care Inform</source><year>2021</year><month>02</month><volume>28</volume><issue>1</issue><fpage>e100251</fpage><pub-id pub-id-type="doi">10.1136/bmjhci-2020-100251</pub-id><pub-id pub-id-type="medline">33547086</pub-id></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vaira</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Lechien</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Abbate</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Validation of the Quality Analysis of Medical Artificial 
Intelligence (QAMAI) tool: a new tool to assess the quality of health information provided by AI platforms</article-title><source>Eur Arch Otorhinolaryngol</source><year>2024</year><month>11</month><volume>281</volume><issue>11</issue><fpage>6123</fpage><lpage>6131</lpage><pub-id pub-id-type="doi">10.1007/s00405-024-08710-0</pub-id><pub-id pub-id-type="medline">38703195</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>CHART Collaborative</collab></person-group><article-title>Protocol for the development of the Chatbot Assessment Reporting Tool (CHART) for clinical advice</article-title><source>BMJ Open</source><year>2024</year><month>05</month><day>21</day><volume>14</volume><issue>5</issue><fpage>e081155</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2023-081155</pub-id><pub-id pub-id-type="medline">38772889</pub-id></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brault</surname><given-names>N</given-names> </name><name name-style="western"><surname>Saxena</surname><given-names>M</given-names> </name></person-group><article-title>For a critical appraisal of artificial intelligence in healthcare: the problem of bias in mHealth</article-title><source>J Eval Clin Pract</source><year>2021</year><month>06</month><volume>27</volume><issue>3</issue><fpage>513</fpage><lpage>519</lpage><pub-id pub-id-type="doi">10.1111/jep.13528</pub-id><pub-id pub-id-type="medline">33369050</pub-id></nlm-citation></ref><ref id="ref69"><label>69</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fletcher</surname><given-names>RR</given-names> </name><name name-style="western"><surname>Nakeshimana</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Olubeko</surname><given-names>O</given-names> </name></person-group><article-title>Addressing fairness, bias, and appropriate use of artificial intelligence and machine learning in global health</article-title><source>Front Artif Intell</source><year>2020</year><volume>3</volume><fpage>561802</fpage><pub-id pub-id-type="doi">10.3389/frai.2020.561802</pub-id><pub-id pub-id-type="medline">33981989</pub-id></nlm-citation></ref><ref id="ref70"><label>70</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mehrabi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Morstatter</surname><given-names>F</given-names> </name><name name-style="western"><surname>Saxena</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lerman</surname><given-names>K</given-names> </name><name name-style="western"><surname>Galstyan</surname><given-names>A</given-names> </name></person-group><article-title>A survey on bias and fairness in machine learning</article-title><source>ACM Comput Surv</source><year>2022</year><month>07</month><day>31</day><volume>54</volume><issue>6</issue><fpage>1</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1145/3457607</pub-id></nlm-citation></ref><ref id="ref71"><label>71</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Schwartz</surname><given-names>R</given-names> </name><name name-style="western"><surname>Vassilev</surname><given-names>A</given-names> </name><name name-style="western"><surname>Greene</surname><given-names>K</given-names> </name><name name-style="western"><surname>Perine</surname><given-names>L</given-names> </name><name name-style="western"><surname>Burt</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hall</surname><given-names>P</given-names> </name></person-group><source>Towards a 
Standard for Identifying and Managing Bias in Artificial Intelligence</source><year>2022</year><access-date>2025-11-11</access-date><publisher-name>National Institute of Standards and Technology (U.S)</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.1270.pdf">https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.1270.pdf</ext-link></comment></nlm-citation></ref><ref id="ref72"><label>72</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>WH</given-names> </name><etal/></person-group><article-title>Algorithmic fairness in computational medicine</article-title><source>EBioMedicine</source><year>2022</year><month>10</month><volume>84</volume><fpage>104250</fpage><pub-id pub-id-type="doi">10.1016/j.ebiom.2022.104250</pub-id><pub-id pub-id-type="medline">36084616</pub-id></nlm-citation></ref><ref id="ref73"><label>73</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Saint James Aquino</surname><given-names>Y</given-names> </name></person-group><article-title>Making decisions: bias in artificial intelligence and data&#x2011;driven diagnostic tools</article-title><source>Aust J Gen Pract</source><year>2023</year><month>07</month><volume>52</volume><issue>7</issue><fpage>439</fpage><lpage>442</lpage><pub-id pub-id-type="doi">10.31128/AJGP-12-22-6630</pub-id><pub-id pub-id-type="medline">37423238</pub-id></nlm-citation></ref><ref id="ref74"><label>74</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Park</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Hu</surname><given-names>J</given-names> </name></person-group><article-title>Bias in artificial intelligence: basic primer</article-title><source>Clin J Am Soc Nephrol</source><year>2023</year><month>03</month><day>1</day><volume>18</volume><issue>3</issue><fpage>394</fpage><lpage>396</lpage><pub-id pub-id-type="doi">10.2215/CJN.0000000000000078</pub-id><pub-id pub-id-type="medline">36723176</pub-id></nlm-citation></ref><ref id="ref75"><label>75</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Perez-Downes</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Tseng</surname><given-names>AS</given-names> </name><name name-style="western"><surname>McConn</surname><given-names>KA</given-names> </name><etal/></person-group><article-title>Mitigating bias in clinical machine learning models</article-title><source>Curr Treat Options Cardio Med</source><year>2024</year><month>03</month><volume>26</volume><issue>3</issue><fpage>29</fpage><lpage>45</lpage><pub-id pub-id-type="doi">10.1007/s11936-023-01032-0</pub-id></nlm-citation></ref><ref id="ref76"><label>76</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Flores</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><name name-style="western"><surname>Young</surname><given-names>SD</given-names> </name></person-group><article-title>Addressing bias in artificial intelligence for public health surveillance</article-title><source>J Med Ethics</source><year>2024</year><month>02</month><day>20</day><volume>50</volume><issue>3</issue><fpage>190</fpage><lpage>194</lpage><pub-id pub-id-type="doi">10.1136/jme-2022-108875</pub-id><pub-id pub-id-type="medline">37130756</pub-id></nlm-citation></ref><ref id="ref77"><label>77</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yeo</surname><given-names>YH</given-names> </name><name name-style="western"><surname>Samaan</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>WH</given-names> </name><etal/></person-group><article-title>Assessing the performance of ChatGPT in answering questions regarding cirrhosis and hepatocellular carcinoma</article-title><source>Clin Mol Hepatol</source><year>2023</year><month>07</month><volume>29</volume><issue>3</issue><fpage>721</fpage><lpage>732</lpage><pub-id pub-id-type="doi">10.3350/cmh.2023.0089</pub-id><pub-id pub-id-type="medline">36946005</pub-id></nlm-citation></ref><ref id="ref78"><label>78</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>D</given-names> </name><name name-style="western"><surname>Goodman</surname><given-names>R</given-names> </name><name name-style="western"><surname>Patrinely</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Assessing the accuracy and reliability of AI-generated medical responses: an evaluation of the chat-GPT model</article-title><source>Res Sq</source><comment>Preprint posted online on  Feb 28, 2023</comment><pub-id pub-id-type="doi">10.21203/rs.3.rs-2566942/v1</pub-id><pub-id pub-id-type="medline">36909565</pub-id></nlm-citation></ref><ref id="ref79"><label>79</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Goh</surname><given-names>E</given-names> </name><name name-style="western"><surname>Bunning</surname><given-names>B</given-names> </name><name name-style="western"><surname>Khoong</surname><given-names>E</given-names> </name><etal/></person-group><article-title>ChatGPT influence on medical decision-making, bias, and equity: a randomized study of 
clinicians evaluating clinical vignettes</article-title><source>medRxiv</source><comment>Preprint posted online on  Nov 27, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.11.24.23298844</pub-id><pub-id pub-id-type="medline">38076944</pub-id></nlm-citation></ref><ref id="ref80"><label>80</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hanna</surname><given-names>DR</given-names> </name><name name-style="western"><surname>Ito</surname><given-names>W</given-names> </name><name name-style="western"><surname>Terry</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Molina</surname><given-names>WR</given-names> </name><name name-style="western"><surname>Whiles</surname><given-names>BB</given-names> </name></person-group><article-title>Utilization of Bing AI chatbot for stone management questions: a comparison of chat response modes and the AUA guidelines</article-title><source>J Endourol</source><year>2023</year><volume>37</volume><fpage>A306</fpage><lpage>A307</lpage><pub-id pub-id-type="doi">10.1089/end.2023.36001.abstracts</pub-id></nlm-citation></ref><ref id="ref81"><label>81</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zakka</surname><given-names>C</given-names> </name><name name-style="western"><surname>Shad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chaurasia</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Almanac - retrieval-augmented language models for clinical medicine</article-title><source>NEJM AI</source><year>2024</year><month>02</month><volume>1</volume><issue>2</issue><pub-id pub-id-type="doi">10.1056/aioa2300068</pub-id><pub-id pub-id-type="medline">38343631</pub-id></nlm-citation></ref><ref id="ref82"><label>82</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Huo</surname><given-names>B</given-names> </name><name name-style="western"><surname>McKechnie</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ortenzi</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Dr. GPT will see you now: the ability of large language model-linked chatbots to provide colorectal cancer screening recommendations</article-title><source>Health Technol</source><year>2024</year><month>05</month><volume>14</volume><issue>3</issue><fpage>463</fpage><lpage>469</lpage><pub-id pub-id-type="doi">10.1007/s12553-024-00836-9</pub-id></nlm-citation></ref><ref id="ref83"><label>83</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Geoghegan</surname><given-names>L</given-names> </name><name name-style="western"><surname>Scarborough</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wormald</surname><given-names>JCR</given-names> </name><etal/></person-group><article-title>Automated conversational agents for post-intervention follow-up: a systematic review</article-title><source>BJS Open</source><year>2021</year><month>07</month><day>6</day><volume>5</volume><issue>4</issue><fpage>zrab070</fpage><pub-id pub-id-type="doi">10.1093/bjsopen/zrab070</pub-id><pub-id pub-id-type="medline">34323916</pub-id></nlm-citation></ref><ref id="ref84"><label>84</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oh</surname><given-names>YJ</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Fukuoka</surname><given-names>Y</given-names> </name></person-group><article-title>A systematic review of artificial 
intelligence chatbots for promoting physical activity, healthy diet, and weight loss</article-title><source>Int J Behav Nutr Phys Act</source><year>2021</year><month>12</month><day>11</day><volume>18</volume><issue>1</issue><fpage>160</fpage><pub-id pub-id-type="doi">10.1186/s12966-021-01224-6</pub-id><pub-id pub-id-type="medline">34895247</pub-id></nlm-citation></ref><ref id="ref85"><label>85</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ogilvie</surname><given-names>L</given-names> </name><name name-style="western"><surname>Prescott</surname><given-names>J</given-names> </name><name name-style="western"><surname>Carson</surname><given-names>J</given-names> </name></person-group><article-title>The use of chatbots as supportive agents for people seeking help with substance use disorder: a systematic review</article-title><source>Eur Addict Res</source><year>2022</year><volume>28</volume><issue>6</issue><fpage>405</fpage><lpage>418</lpage><pub-id pub-id-type="doi">10.1159/000525959</pub-id><pub-id pub-id-type="medline">36041418</pub-id></nlm-citation></ref><ref id="ref86"><label>86</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Aggarwal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tam</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Qiao</surname><given-names>S</given-names> </name></person-group><article-title>Artificial intelligence (AI)-based chatbots in promoting health behavioral changes: a systematic review</article-title><source>medRxiv</source><comment>Preprint posted online on 2022</comment><pub-id pub-id-type="doi">10.1101/2022.07.05.22277263</pub-id></nlm-citation></ref><ref 
id="ref87"><label>87</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Webster</surname><given-names>EM</given-names> </name><name name-style="western"><surname>Ahsan</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Perez</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Chatbot artificial intelligence for genetic cancer risk assessment and counseling: a systematic review and meta-analysis</article-title><source>JCO Clin Cancer Inform</source><year>2023</year><month>09</month><volume>7</volume><issue>7</issue><fpage>e2300123</fpage><pub-id pub-id-type="doi">10.1200/CCI.23.00123</pub-id><pub-id pub-id-type="medline">37934933</pub-id></nlm-citation></ref><ref id="ref88"><label>88</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bendotti</surname><given-names>H</given-names> </name><name name-style="western"><surname>Lawler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>GCK</given-names> </name><name name-style="western"><surname>Gartner</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ireland</surname><given-names>D</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>HM</given-names> </name></person-group><article-title>Conversational artificial intelligence interventions to support smoking cessation: a systematic review and meta-analysis</article-title><source>Digit Health</source><year>2023</year><volume>9</volume><fpage>20552076231211634</fpage><pub-id pub-id-type="doi">10.1177/20552076231211634</pub-id><pub-id pub-id-type="medline">37928336</pub-id></nlm-citation></ref><ref id="ref89"><label>89</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Singh</surname><given-names>B</given-names> </name><name name-style="western"><surname>Olds</surname><given-names>T</given-names> </name><name name-style="western"><surname>Brinsley</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Systematic review and meta-analysis of the effectiveness of chatbots on lifestyle behaviours</article-title><source>NPJ Digit Med</source><year>2023</year><month>06</month><day>23</day><volume>6</volume><issue>1</issue><fpage>118</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00856-1</pub-id><pub-id pub-id-type="medline">37353578</pub-id></nlm-citation></ref><ref id="ref90"><label>90</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Noh</surname><given-names>E</given-names> </name><name name-style="western"><surname>Won</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hahm</surname><given-names>DH</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>H</given-names> </name></person-group><article-title>Conversational agents for body weight management: systematic review</article-title><source>J Med Internet Res</source><year>2023</year><month>05</month><day>26</day><volume>25</volume><fpage>e42238</fpage><pub-id pub-id-type="doi">10.2196/42238</pub-id><pub-id pub-id-type="medline">37234029</pub-id></nlm-citation></ref><ref id="ref91"><label>91</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>HK</given-names> </name></person-group><article-title>The effects of artificial intelligence chatbots on women&#x2019;s health: a systematic review and meta-analysis</article-title><source>Healthcare 
(Basel)</source><year>2024</year><month>02</month><day>23</day><volume>12</volume><issue>5</issue><fpage>534</fpage><pub-id pub-id-type="doi">10.3390/healthcare12050534</pub-id><pub-id pub-id-type="medline">38470645</pub-id></nlm-citation></ref><ref id="ref92"><label>92</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Cabello</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Ruiz Garc&#x00ED;a</surname><given-names>V</given-names> </name><name name-style="western"><surname>Torralba</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Data availability for &#x201C;critical appraisal tools for artificial intelligence clinical studies. a scoping review&#x201D; (preprint)</article-title><source>JMIR Data</source><comment>Preprint posted online on  Oct 13, 2025</comment><pub-id pub-id-type="doi">10.2196/preprints.85688</pub-id></nlm-citation></ref><ref id="ref93"><label>93</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Charnock</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shepperd</surname><given-names>S</given-names> </name><name name-style="western"><surname>Needham</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gann</surname><given-names>R</given-names> </name></person-group><article-title>DISCERN: an instrument for judging the quality of written consumer health information on treatment choices</article-title><source>J Epidemiol Community Health</source><year>1999</year><month>02</month><volume>53</volume><issue>2</issue><fpage>105</fpage><lpage>111</lpage><pub-id pub-id-type="doi">10.1136/jech.53.2.105</pub-id><pub-id pub-id-type="medline">10396471</pub-id></nlm-citation></ref><ref id="ref94"><label>94</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Dechartres</surname><given-names>A</given-names> </name><name name-style="western"><surname>Trinquart</surname><given-names>L</given-names> </name><name name-style="western"><surname>Faber</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ravaud</surname><given-names>P</given-names> </name></person-group><article-title>Empirical evaluation of which trial characteristics are associated with treatment effect estimates</article-title><source>J Clin Epidemiol</source><year>2016</year><month>09</month><volume>77</volume><fpage>24</fpage><lpage>37</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2016.04.005</pub-id><pub-id pub-id-type="medline">27140444</pub-id></nlm-citation></ref><ref id="ref95"><label>95</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dwan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name><name name-style="western"><surname>Clarke</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Evidence for the selective reporting of analyses and discrepancies in clinical trials: a systematic review of cohort studies of clinical trials</article-title><source>PLoS Med</source><year>2014</year><month>06</month><volume>11</volume><issue>6</issue><fpage>e1001666</fpage><pub-id pub-id-type="doi">10.1371/journal.pmed.1001666</pub-id><pub-id pub-id-type="medline">24959719</pub-id></nlm-citation></ref><ref id="ref96"><label>96</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Simera</surname><given-names>I</given-names> </name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hirst</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Hoey</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schulz</surname><given-names>KF</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name></person-group><article-title>Transparent and accurate reporting increases reliability, utility, and impact of your research: reporting guidelines and the EQUATOR Network</article-title><source>BMC Med</source><year>2010</year><month>04</month><day>26</day><volume>8</volume><issue>1</issue><fpage>24</fpage><pub-id pub-id-type="doi">10.1186/1741-7015-8-24</pub-id><pub-id pub-id-type="medline">20420659</pub-id></nlm-citation></ref><ref id="ref97"><label>97</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name></person-group><article-title>Reporting guidelines: doing better for readers</article-title><source>BMC Med</source><year>2018</year><month>12</month><day>14</day><volume>16</volume><issue>1</issue><fpage>233</fpage><pub-id pub-id-type="doi">10.1186/s12916-018-1226-0</pub-id><pub-id pub-id-type="medline">30545364</pub-id></nlm-citation></ref><ref id="ref98"><label>98</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Furuya-Kanamori</surname><given-names>L</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hasan</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Doi</surname><given-names>SA</given-names> </name></person-group><article-title>Quality versus risk-of-bias assessment in clinical research</article-title><source>J Clin Epidemiol</source><year>2021</year><month>01</month><volume>129</volume><fpage>172</fpage><lpage>175</lpage><pub-id 
pub-id-type="doi">10.1016/j.jclinepi.2020.09.044</pub-id><pub-id pub-id-type="medline">33422267</pub-id></nlm-citation></ref><ref id="ref99"><label>99</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huo</surname><given-names>B</given-names> </name><name name-style="western"><surname>Boyle</surname><given-names>A</given-names> </name><name name-style="western"><surname>Marfo</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Large language models for chatbot health advice studies: a systematic review</article-title><source>JAMA Netw Open</source><year>2025</year><month>02</month><day>3</day><volume>8</volume><issue>2</issue><fpage>e2457879</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.57879</pub-id><pub-id pub-id-type="medline">39903463</pub-id></nlm-citation></ref><ref id="ref100"><label>100</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kolbinger</surname><given-names>FR</given-names> </name><name name-style="western"><surname>Veldhuizen</surname><given-names>GP</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Truhn</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kather</surname><given-names>JN</given-names> </name></person-group><article-title>Reporting guidelines in medical artificial intelligence: a systematic review and meta-analysis</article-title><source>Commun Med (Lond)</source><year>2024</year><month>04</month><day>11</day><volume>4</volume><issue>1</issue><fpage>71</fpage><pub-id pub-id-type="doi">10.1038/s43856-024-00492-0</pub-id><pub-id pub-id-type="medline">38605106</pub-id></nlm-citation></ref><ref id="ref101"><label>101</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Lekadir</surname><given-names>K</given-names> </name><name name-style="western"><surname>Frangi</surname><given-names>AF</given-names> </name><name name-style="western"><surname>Porras</surname><given-names>AR</given-names> </name><etal/></person-group><article-title>FUTURE-AI: international consensus guideline for trustworthy and deployable artificial intelligence in healthcare</article-title><source>BMJ</source><year>2025</year><month>02</month><day>5</day><volume>388</volume><fpage>e081554</fpage><pub-id pub-id-type="doi">10.1136/bmj-2024-081554</pub-id><pub-id pub-id-type="medline">39909534</pub-id></nlm-citation></ref><ref id="ref102"><label>102</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Turing</surname><given-names>AM</given-names> </name></person-group><article-title>Computing machinery and intelligence</article-title><source>Mind</source><year>1950</year><month>10</month><day>1</day><volume>LIX</volume><issue>236</issue><fpage>433</fpage><lpage>460</lpage><pub-id pub-id-type="doi">10.1093/mind/LIX.236.433</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Data extraction template.</p><media xlink:href="jmir_v27i1e77110_app1.docx" xlink:title="DOCX File, 25 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>GPT&#x2013;Retrieval Augmented Generation (RAG; prompting) template.</p><media xlink:href="jmir_v27i1e77110_app2.docx" xlink:title="DOCX File, 21 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Exclusions after full-text screening.</p><media xlink:href="jmir_v27i1e77110_app3.docx" xlink:title="DOCX File, 40 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Footnotes for figures and tables.</p><media 
xlink:href="jmir_v27i1e77110_app4.docx" xlink:title="DOCX File, 23 KB"/></supplementary-material><supplementary-material id="app5"><label>Checklist 1</label><p>PRISMA (Preferred Reporting Items for Systematic reviews and Meta-Analyses) checklist search tools.</p><media xlink:href="jmir_v27i1e77110_app5.docx" xlink:title="DOCX File, 29 KB"/></supplementary-material><supplementary-material id="app6"><label>Checklist 2</label><p>PRISMA-SCR (Preferred Reporting Items for Systematic reviews and Meta-Analyses extension for Scoping Reviews) checklist.</p><media xlink:href="jmir_v27i1e77110_app6.docx" xlink:title="DOCX File, 109 KB"/></supplementary-material></app-group></back></article>