<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e87057</article-id><article-id pub-id-type="doi">10.2196/87057</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>The Alberta Quality Assessment Tool: Risk of Bias (AQAT:RoB) for the Evaluation of Medical Large Language Model Question-Answer Studies: Development and Pilot Validation</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Ye</surname><given-names>Carrie</given-names></name><degrees>MPH, MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mitchell</surname><given-names>Joseph Ross</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Baumgart</surname><given-names>Daniel C</given-names></name><degrees>MBA, MD, 
PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ma</surname><given-names>Zechen</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fung</surname><given-names>Angela Lim</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Orellana</surname><given-names>Daniela Garcia</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chowdhury</surname><given-names>Juel</given-names></name><degrees>MBBS, MPH</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Abass</surname><given-names>Abdullah</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Katz</surname><given-names>Steven</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jaremko</surname><given-names>Jacob L</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Boulanger</surname><given-names>Pierre</given-names></name><degrees>PEng, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Barber</surname><given-names>Claire E H</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Lemermeyer</surname><given-names>Gillian</given-names></name><degrees>RN, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jabbari</surname><given-names>Hosna</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mou</surname><given-names>Lili</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mirzaei</surname><given-names>Maryam</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Githumbi</surname><given-names>Mary Waithera Beckett</given-names></name><degrees>MBA</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tandon</surname><given-names>Puneeta</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Goebel</surname><given-names>Randy</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Clark</surname><given-names>Rhys</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hung</surname><given-names>Whitney</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Abbasi</surname><given-names>Marjan</given-names></name><degrees>MD</degrees><xref 
ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Maleki</surname><given-names>Farhad</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Klarenbach</surname><given-names>Scott</given-names></name><degrees>MSc, MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Abdalla</surname><given-names>Mohamed</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib></contrib-group><aff id="aff1"><institution>University of Alberta</institution><addr-line>8-130 Clinical Sciences Building, 11350 83 Ave NW</addr-line><addr-line>Edmonton</addr-line><addr-line>AB</addr-line><country>Canada</country></aff><aff id="aff2"><institution>Arthritis Research Canada</institution><addr-line>Vancouver</addr-line><addr-line>BC</addr-line><country>Canada</country></aff><aff id="aff3"><institution>Alberta Health Services</institution><addr-line>Edmonton</addr-line><addr-line>AB</addr-line><country>Canada</country></aff><aff id="aff4"><institution>Alberta Machine Intelligence Institute</institution><addr-line>Edmonton</addr-line><addr-line>AB</addr-line><country>Canada</country></aff><aff id="aff5"><institution>University of Toronto</institution><addr-line>Toronto</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff6"><institution>University of Oxford</institution><addr-line>Oxford</addr-line><country>United Kingdom</country></aff><aff id="aff7"><institution>Queen's University</institution><addr-line>Kingston</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff8"><institution>University of Calgary</institution><addr-line>Calgary</addr-line><addr-line>AB</addr-line><country>Canada</country></aff><aff id="aff9"><institution>NAIT 
Applied Research</institution><addr-line>Edmonton</addr-line><addr-line>AB</addr-line><country>Canada</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Njoku</surname><given-names>Amarachi</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Surianarayanan</surname><given-names>Gayathri</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Michelena</surname><given-names>Xabier</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Carrie Ye, MPH, MD, University of Alberta, 8-130 Clinical Sciences Building, 11350 83 Ave NW, Edmonton, AB, T6G2G3, Canada, 1 7804927002, 1 7804926088; <email>cye@ualberta.ca</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>8</day><month>4</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e87057</elocation-id><history><date date-type="received"><day>04</day><month>11</month><year>2025</year></date><date date-type="accepted"><day>24</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Carrie Ye, Joseph Ross Mitchell, Daniel C Baumgart, Zechen Ma, Angela Lim Fung, Daniela Garcia Orellana, Juel Chowdhury, Abdullah Abass, Steven Katz, Jacob L Jaremko, Pierre Boulanger, Claire E H Barber, Gillian Lemermeyer, Hosna Jabbari, Lili Mou, Maryam Mirzaei, Mary Waithera Beckett Githumbi, Puneeta Tandon, Randy Goebel, Rhys Clark, Whitney Hung, Marjan Abbasi, Farhad Maleki, Scott Klarenbach, Mohamed Abdalla. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 8.4.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e87057"/><abstract><sec><title>Background</title><p>Despite the transformative potential of large language models (LLMs) in health care, the rapid development of these tools has outpaced their rigorous evaluation. While artificial intelligence&#x2013;specific reporting guidelines have been developed to address standardized reporting of artificial intelligence studies, there is currently no specific tool available for risk of bias assessment of LLM question-answer (QA) studies. Existing risk-of-bias tools for medical research are not well suited to the unique challenges of evaluating LLM-QA studies, which creates a critical gap in assessing their safety and effectiveness.</p></sec><sec><title>Objective</title><p>This study aims to develop the Alberta Quality Assessment Tool: Risk of Bias (AQAT:RoB) for LLM-QA studies to systematically evaluate the validity and risk of bias in LLM-QA studies.</p></sec><sec sec-type="methods"><title>Methods</title><p>We conducted 2 literature reviews. 
The first was on quality assessment tools for LLM-QA studies, and the second was on LLM-QA studies, which informed the first draft of the AQAT:RoB. The draft AQAT:RoB was further refined through a prespecified iterative process of modified Delphi, consensus meeting, and validation. The first Delphi process occurred between May 1 and May 20, 2025, and the first consensus meeting was held on May 22. The first round of validation was completed by 4 evaluators, who were not part of the consensus meeting, on 16 randomly selected studies. As this first round of validation surpassed our a priori threshold of &#x2265;80% agreement and a Cohen &#x03BA; of &#x2265;0.61 between evaluators, no further rounds of development and validation were undertaken. A second Delphi process occurred between February 20 and February 23, 2026, to vote on postpilot changes in response to peer review.</p></sec><sec sec-type="results"><title>Results</title><p>The AQAT:RoB consists of 5 high-level domains (Questions, Reference Answers, LLM Answers, Evaluators, Outcomes). These domains are subdivided into 9 subdomains. Each subdomain includes at least one &#x201C;Support for Judgment&#x201D; and at least one &#x201C;Type of Bias&#x201D; and is to be rated &#x201C;low,&#x201D; &#x201C;high,&#x201D; or &#x201C;unclear&#x201D; for risk of bias. A pilot evaluation was completed by internal validators who were not part of the consensus discussion and were asked to complete the AQAT:RoB form for each assigned study. Each of the 16 studies was evaluated by 2 evaluators independently. Pilot validation showed a percent agreement of 86.1% and a Cohen &#x03BA; of 0.70 between assessors.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The AQAT:RoB demonstrates promising initial reliability for assessing the validity or risk of bias in LLM-QA studies. 
The tool will benefit from future refinements, external validation, and periodic updates to keep pace with evolving technology.</p></sec></abstract><kwd-group><kwd>risk of bias</kwd><kwd>quality assessment</kwd><kwd>large language model</kwd><kwd>question-answer studies</kwd><kwd>Alberta Risk of Bias Assessment Tool for LLM-QA studies</kwd><kwd>AQAT:RoB</kwd><kwd>chatbot</kwd><kwd>artificial intelligence</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Large language models (LLMs) represent a significant technological advancement with transformative potential across various sectors, including health care. Their capabilities in processing and generating human-like text have led to their rapid emergence as tools capable of assisting in complex medical activities, such as disease diagnosis, clinical decision-making, and even administrative tasks, such as writing prescriptions or assigning billing codes [<xref ref-type="bibr" rid="ref1">1</xref>]. As these sophisticated tools become more integrated into health care ecosystems, robust and rigorous evaluation of their efficacy, safety, and utility is paramount. A critical component of this evaluation involves human assessments, where the performance, usability, and impact of LLM question-answer systems (LLM-QA) such as medical chatbots are gauged through interactions with health care professionals, patients, or simulated users [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>However, the quality of these human evaluation studies varies significantly, with a systematic review indicating that only 5% of studies used real patient care data for LLM evaluation, which can significantly impact the trustworthiness and generalizability of their findings [<xref ref-type="bibr" rid="ref2">2</xref>]. 
Without a systematic approach to evaluating the quality and risk of bias in studies assessing LLM-QA, the rapid pace of development could outpace the generation of reliable evidence regarding their actual utility and safety in real-world clinical scenarios, potentially leading to premature or even harmful adoption.</p><p>Existing tools for assessing risk of bias, such as the Cochrane Risk of Bias 2 tool (RoB 2) [<xref ref-type="bibr" rid="ref4">4</xref>] for randomized studies, the Quality Assessment of Diagnostic Accuracy Studies 2 (QUADAS-2) [<xref ref-type="bibr" rid="ref5">5</xref>] for diagnostic test studies, the Newcastle-Ottawa Scale (NOS) [<xref ref-type="bibr" rid="ref6">6</xref>], and the Risk of Bias in Non-randomized Studies - of Exposures tool (ROBINS-E) [<xref ref-type="bibr" rid="ref7">7</xref>] for nonrandomized studies, are foundational in their respective areas but fall short when applied to the unique methodological and reporting challenges inherent in human evaluation studies of LLMs. While artificial intelligence (AI)&#x2013;specific quality assessment tools exist, such as the Prediction model Risk of Bias Assessment Tool + AI (PROBAST-AI) [<xref ref-type="bibr" rid="ref8">8</xref>] and APPRAISE-AI [<xref ref-type="bibr" rid="ref9">9</xref>], these focus on studies of prediction models using machine learning and are not applicable to LLM-QA studies.</p><p>The burgeoning interest in the field is evident from the notable surge in studies pertaining to LLM medical chatbots published in recent years, underscoring the topic&#x2019;s emerging relevance and the urgent need for robust evaluation methodologies [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. 
AI-specific reporting guidelines have been developed to address standardized reporting of AI studies [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref15">15</xref>], including a reporting checklist specifically for chatbot health advice studies, CHART (Chatbot Assessment Reporting Tool) [<xref ref-type="bibr" rid="ref13">13</xref>]. However, transparent and comprehensive reporting is only one aspect of quality assessment&#x2014;the other being assessment of risk of bias.</p><p>There is currently no tool available to assess the risk of bias in LLM-QA studies. This gap creates a significant challenge for researchers, clinicians, and policymakers attempting to synthesize evidence and make informed decisions about the integration of medical LLM-QA systems. Without a comprehensive and tailored risk of bias assessment tool, the risk of misinterpreting findings, perpetuating methodological flaws, and drawing unsubstantiated conclusions from human evaluation studies is high. To address this knowledge gap, we took a pragmatic but systematic approach to develop and validate the Alberta Quality Assessment Tool: Risk of Bias (AQAT:RoB) for LLM-QA studies for the systematic and comprehensive assessment of the risk of bias in medical LLM-QA studies, addressing aspects unique to this emerging field. The tool is intended to evaluate the quality of studies that involve human participants in assessing the outputs of AI models that utilize natural language interactions.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>The AQAT:RoB development and validation started with the 2 literature reviews. 
The first was on quality assessment tools for LLM-QA studies, and the second was on LLM-QA studies, which informed the first draft of AQAT:RoB, which went through an iterative process of modified Delphi, consensus meeting, and validation, until our a priori threshold for interrater agreement was met (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Alberta Quality Assessment Tool: Risk of Bias (AQAT:RoB) development and validation. This figure outlines the development and pilot validation process (April-September 2025) that was a priori determined and followed to create the AQAT:RoB. The dotted line implies a possible path that would have been followed had the agreement threshold not been met (though this was ultimately not required). LLMs: large language models; LLM-QA: LLM question-answer; RoB: risk of bias. LLM-QA studies include [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>] and a forthcoming systematic literature review of studies evaluating LLMs for patient-facing health information (protocol registered with PROSPERO; CRD42023461630 [<xref ref-type="bibr" rid="ref16">16</xref>]).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e87057_fig01.png"/></fig></sec><sec id="s2-2"><title>Ethical Considerations</title><p>The University of Alberta&#x2019;s Research Ethics Board deemed that this project meets one of the conditions described under Chapter 2 of <italic>Tri-Council Policy Statement: Ethical Conduct for Research Involving Humans</italic> (2022) [<xref ref-type="bibr" rid="ref17">17</xref>] as an activity that does not require Research Ethics Board review. 
The AQAT:RoB has been registered with the LATITUDES Network, which was established to increase the robustness of evidence synthesis by improving the process of validity (risk of bias) assessment [<xref ref-type="bibr" rid="ref18">18</xref>].</p></sec><sec id="s2-3"><title>Literature Review</title><p>We conducted a search query in PubMed in April 2025 to look for existing risk of bias assessment tools specific to LLM-QA studies. The query searched for all studies that contained either of the terms &#x201C;risk of bias&#x201D; or &#x201C;quality assessment&#x201D; with any of the terms &#x201C;large language models,&#x201D; &#x201C;generative AI,&#x201D; or &#x201C;chatbot&#x201D; [(&#x201C;risk of bias&#x201D; OR &#x201C;quality assessment&#x201D;) AND (&#x201C;large language models&#x201D; OR &#x201C;generative AI&#x201D; OR &#x201C;chatbot&#x201D;)]. We limited the search to English studies published in the last 5 years and retrieved 91 results. The search was updated in June 2025, which retrieved 113 studies, and again in September 2025, which retrieved 149 studies. None of these searches included tools for assessing validity or risk of bias in LLM studies. Most studies pertained to the use of LLMs to perform risk of bias assessments. Around the time of each PubMed search, the LATITUDES Network library [<xref ref-type="bibr" rid="ref19">19</xref>] of risk of bias assessment tools and tools in development was searched for tools that pertained to LLM studies; none were found.</p><p>We conducted a literature review of LLM-QA studies to (1) inform the development of AQAT:RoB and (2) find studies for validating the AQAT:RoB. Our group had already conducted, with the assistance of an experienced librarian (DCB), a systematic literature review of studies evaluating LLMs for patient-facing health information (manuscript in progress, protocol registered with PROSPERO [CRD42023461630]) [<xref ref-type="bibr" rid="ref16">16</xref>]. 
Medline, Embase, Web of Science, CINAHL, PsycINFO, and Google Scholar were searched for studies published up to July 5, 2023, and then updated to March 7, 2024, limiting searches to the last 10 years prior to the search (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Searches were limited to the English language but not to geographic regions. All citations were imported into Covidence for duplicate removal and screening. Abstracts and full texts were each screened independently by 3 reviewers (ZM, ALF, DGO). Disagreements were resolved through a third reviewer (CY). A total of 8943 records were identified, including 2798 duplicates (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). In total, 6145 titles and abstracts were screened with 327 found to be relevant for full-text review. Of these 327 full texts, 40 were deemed to be original research studies pertaining to LLMs for patient education (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p><p>We found 2 very recently published systematic reviews by Wang et al [<xref ref-type="bibr" rid="ref11">11</xref>] and Huo et al [<xref ref-type="bibr" rid="ref10">10</xref>], which, along with our systematic review [<xref ref-type="bibr" rid="ref16">16</xref>], we felt covered the breadth of LLM-QA studies and were thus sufficient to inform the development and validation of the AQAT:RoB. In the systematic literature review conducted by Huo et al [<xref ref-type="bibr" rid="ref10">10</xref>], their search of MEDLINE, Embase, and Web of Science from inception to October 27, 2023, resulted in 137 eligible studies evaluating the performance of generative AI-driven chatbots. They found that key aspects of internal validity, such as standardized evaluation process, blinding of evaluators, or reference standards, were not well described or simply not included in the studies [<xref ref-type="bibr" rid="ref10">10</xref>]. 
Wang et al [<xref ref-type="bibr" rid="ref11">11</xref>] queried PubMed, Embase, Web of Science, and Scopus from inception until October 14, 2024, and found 168 studies on the accuracy of LLMs when answering clinical questions (although one of these studies has since been retracted). They performed a risk of bias assessment using the NOS [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Acknowledging the limitations of using the NOS tool given the limited relevance to these studies, they found that only 40 (23.8%) of 168 studies were assessed as having a low overall risk of bias [<xref ref-type="bibr" rid="ref11">11</xref>].</p></sec><sec id="s2-4"><title>Candidate Item List Generation</title><p>The initial list of items for the AQAT:RoB assessment tool was drafted by CY and MA by reviewing the results and studies of the 3 recent systematic literature reviews [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref16">16</xref>] to identify potential sources of bias in LLM-QA and by adapting existing foundational risk of bias assessment tools, including RoB 2 [<xref ref-type="bibr" rid="ref20">20</xref>], NOS [<xref ref-type="bibr" rid="ref6">6</xref>], QUADAS-2 [<xref ref-type="bibr" rid="ref21">21</xref>], and ROBINS-E [<xref ref-type="bibr" rid="ref7">7</xref>]. The steering committee (CY, MA, JRM, DCB) further developed the initial list of items to form the first draft of the AQAT:RoB (7 domains, 12 sources of bias, 15 supports). 
This first draft was used in the modified Delphi procedure described below.</p></sec><sec id="s2-5"><title>Recruitment of Delphi Panelists</title><p>To identify participants for the modified Delphi process, the steering committee recruited medical AI experts through the Alberta Machine Intelligence Institute [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref22">22</xref>], the University of Alberta AI + Health Hub [<xref ref-type="bibr" rid="ref23">23</xref>], and the University of Calgary&#x2019;s Centre for Health Informatics [<xref ref-type="bibr" rid="ref24">24</xref>]. Interested participants were asked to complete an intake questionnaire regarding demographics, training, and related expertise. All applicants were screened, and panelists were selected by the steering committee to ensure appropriate expertise and diversity and representation on the Delphi panel. In total, there were 19 Delphi panelists, including clinicians (8), computer scientists (8), methodologists (4), researchers (qualitative and quantitative, 17), journal editors (2), and patient partners (2). All Delphi participants were instructed to watch 2 videos that provided background information on risk assessment tools prior to initiating the Delphi process.</p></sec><sec id="s2-6"><title>Modified Delphi Process</title><p>The modified Delphi process occurred between May 1 and May 20, 2025. The steering committee created the Delphi survey using Google Forms. Each participant responded to the survey individually. Participants were asked to rate each item&#x2019;s potential as a source of bias on a 5-point scale (from 1=&#x201C;Not a potential source of bias&#x201D; to 5=&#x201C;High potential source of bias&#x201D;). If participants selected either 1=&#x201C;Not a potential source&#x201D; or 2=&#x201C;Unlikely to be a potential source,&#x201D; they were prompted to provide a short explanation. 
Participants were also encouraged to use the free-text boxes that followed each item to identify any missing potential sources of biases, questions to identify biases (ie, supports), or types of bias identified by listed supports. The threshold for removing items was that more than 50% (10/19) of the participants voted the item as 1=&#x201C;Not a potential source&#x201D; or 2=&#x201C;Unlikely to be a potential source,&#x201D; and the threshold for retaining items was less than 50%. Participants were not able to see other participants&#x2019; votes or comments.</p></sec><sec id="s2-7"><title>Changes From the Delphi Process</title><p>Most items (8 of 12) were rated highly for inclusion (ie, more than 70% (14/19) rated as a potential source of bias). Items rated poorly for agreement or for potential source of bias, and all comments provided by the participants in the free-text boxes were considered for changes (eg, removal, modification, or merging). Based on feedback from participants, no domains or items were added or removed, but modifications were made to 5 items.</p></sec><sec id="s2-8"><title>Consensus Meeting</title><p>An online consensus meeting was held on May 22, 2025, chaired by the steering committee. All panelists were invited except for 2 (S Katz and JLJ) who were excluded from the discussion to serve as adjudicators in the validation phase. The consensus meeting was chaired by the steering committee, and 16 participants attended the synchronous consensus meeting. During this meeting, participants were presented with the initial version of texts; suggested modified versions of the text incorporating the feedback from the modified Delphi process, as well as summary statistics of ratings; and provided comments. During the meeting, each potential source of bias was discussed until consensus was reached on inclusion, type of bias(es), and wording. 
Once consensus was felt to be reached based on the panel discussion, a formal vote was taken, and unanimity was required before moving on to the next item. At the end of each domain, the panel was asked to discuss if there were any additional potential sources of bias pertaining to that domain.</p></sec><sec id="s2-9"><title>Changes From the Consensus Meeting</title><p>The discussions during the consensus process resulted in multiple changes. There was robust discussion about how granular the &#x201C;Types of biases&#x201D; should be, with the group arriving at the conclusion that for the sake of utility and widespread applicability, we would aim for a high-level description of the types of biases. This change affected 5 of 12 &#x201C;Potential Sources of Bias.&#x201D; Furthermore, there was an addition of &#x201C;Support for Judgment&#x201D; for the domain &#x201C;Performance Metrics.&#x201D;</p></sec><sec id="s2-10"><title>Pilot Validation</title><p>We piloted the AQAT:RoB on 16 studies [<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref40">40</xref>], randomly selected from the 319 studies (after removal of duplicates) identified in 3 recent systematic literature reviews [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref16">16</xref>] (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). Random selection was facilitated by listing all 319 studies in alphabetical order of the first author&#x2019;s last name and then using a random number generator (between 1 and 319) to select the 16 studies [<xref ref-type="bibr" rid="ref41">41</xref>]. Four evaluators (S Katz, JLJ, JC, AA) were asked to complete the AQAT:RoB form for each assigned study. None of the 4 evaluators were part of the consensus discussion, but S Katz and JLJ were on the modified Delphi panel. JC and AA were not part of the tool development process prior to the validation step. 
All the evaluators were physicians across various specialties (rheumatology, radiology, public health, and primary care). Each of the 16 studies was evaluated by 2 evaluators independently. Evaluators were not provided with any standardized training in order to obtain the most conservative estimates of agreement.</p><p>We set an a priori threshold of &#x003E;80% agreement and a Cohen &#x03BA; of &#x2265;0.60, which would demonstrate substantial agreement per Landis and Koch&#x2019;s [<xref ref-type="bibr" rid="ref42">42</xref>] classification system. If we reached this threshold, no further rounds of changes or consensus would be undertaken. If we did not reach this threshold, we planned to pursue further rounds of modified Delphi or consensus or validation until this threshold was achieved (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p></sec><sec id="s2-11"><title>Protocol Deviations in Response to Peer Review</title><p>The steering committee proposed moving the Reporting and Conflict of Interest domains to &#x201C;Additional consideration&#x201D; for the Delphi panel after the first round of validation was completed. The rationale was that Reporting was better assessed by stand-alone reporting checklists, and while reporting may lead to an unclear risk of bias judgment, it does not represent a mechanistic bias similar to the other domains in the AQAT:RoB. Similarly, Conflict of Interest may be a predictor or source of bias rather than a distinct mechanism of bias. We set the voting threshold to make this change at 70% for the Delphi panel and planned to conduct another round of validation if the overall percent agreement and Cohen &#x03BA; after removing these 2 domains did not reach our a priori threshold of &#x003E;80% agreement and Cohen &#x03BA; of &#x003E;0.60. 
Note that 100% of the panel voted in agreement with this change.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Validation Results</title><p>After the first modified Delphi and consensus panel, we met our threshold with a percent agreement of 82.8% and a Cohen &#x03BA; of 0.63 (calculated across all items; <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). After the second modified Delphi, in response to peer review, during which 2 domains were removed, the percent agreement was 86.1% and Cohen &#x03BA; was 0.70. The domain with the highest level of agreement was Question Selection (agreement: 93.8%, &#x03BA;: 0.86), while the domain with the lowest level of agreement was LLM Answer Selection (agreement: 68.8%, &#x03BA;: 0.30).</p></sec><sec id="s3-2"><title>AQAT:RoB Tool</title><p>The AQAT:RoB tool [<xref ref-type="bibr" rid="ref43">43</xref>] is summarized in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>. An easy-to-use version, which is both fillable and printable, is available on the AQAT website [<xref ref-type="bibr" rid="ref43">43</xref>]. <xref ref-type="other" rid="box1">Textbox 1</xref> presents the scope and boundaries of studies for which AQAT:RoB is applicable.</p><boxed-text id="box1"><title> Utilization of Alberta Quality Assessment Tool: Risk of Bias (AQAT:RoB).</title><p>Intended users: Anyone who wants to appraise the quality or risk of bias of studies in which there are human evaluations of large language model question-answer systems. It is especially important for researchers during the development and peer review of such studies and during the quality assessment stage of systematic literature reviews and meta-analyses. 
Other potential users include, but are not limited to, patients, health care providers, journal editors and reviewers, medical technology manufacturers, health system administrators, and policymakers.</p><p>Target studies: Any study that involves the human evaluation of large language models that provide answers to free-text questions including but not limited to:</p><list list-type="bullet"><list-item><p>Patient-facing applications</p><list list-type="bullet"><list-item><p>Medical chatbots</p></list-item><list-item><p>Summary tools (eg, answer questions about imaging reports or doctors&#x2019; reports)</p></list-item></list></list-item><list-item><p>Physician-facing applications</p><list list-type="bullet"><list-item><p>General medical chatbots</p></list-item><list-item><p>Large language model&#x2013;based clinical decision support systems for physicians</p></list-item><list-item><p>Summary tools (eg, answer questions about patient charts)</p></list-item></list></list-item><list-item><p>Research-based applications:</p><list list-type="bullet"><list-item><p>Case finding (eg, find participants based on electronic medical record or electronic health record data and provide justification)</p></list-item><list-item><p>Literature review (eg, analyze and summarize scientific literature)</p></list-item></list></list-item></list></boxed-text><p>The AQAT:RoB consists of 5 high-level domains (Questions, Reference Answers, LLM Answers, Evaluators, and Outcomes). These domains are subdivided into 9 subdomains. Each subdomain includes at least one &#x201C;Support for Judgment&#x201D; and at least one &#x201C;Type of Bias.&#x201D; Further descriptions of potential sources of bias and best methodological practices are outlined in the text below. In cases of missing, partial, or suboptimal reporting of a specific domain or subdomain, the rating of &#x201C;unclear&#x201D; should be assigned. 
Additional considerations regarding reporting and conflicts of interest are outlined in the <italic>Additional Considerations</italic> section.</p><sec id="s3-2-1"><title>Domain 1: Questions</title><sec id="s3-2-1-1"><title>Question Source</title><p>Supports for Judgment:</p><list list-type="bullet"><list-item><p>If questions were created or generated specifically for the study, describe the method used to create the question dataset, including who created the questions and if the questions are reflective of the intended study objective.</p></list-item><list-item><p>If questions were selected from an existing question source, adequately describe the source to allow an assessment of whether it addresses the intended research question.</p></list-item></list><p>The evaluation of LLM-QA models should be conducted against questions that reflect the intended use case, as deviations can introduce biases. When the deviation between intended use and the question source is substantial, the performance on the proxy task may not be generalizable to the stated application. To minimize this risk, the most effective approach is to source questions directly from the real-world use setting. If this is not possible, external data sources are often used to generate questions. In such cases, researchers must justify the degree to which these sources align with the study&#x2019;s core research question and the tool&#x2019;s intended application. For example, if a study evaluates a tool to be used by patients, but the questions are written by a research team of nonpatients, this could introduce bias, as the language and complexity of the questions may not be representative of the intended user. 
Furthermore, if questions were pulled from a preexisting source, it should be clearly stated if the test questions were included in the training data of the tested models, as performance may reflect memorization versus true model reasoning.</p></sec><sec id="s3-2-1-2"><title>Question Selection</title><p>Support for Judgment:</p><list list-type="bullet"><list-item><p>If questions were selected from an existing question source, describe the method used to select the questions from the original source (eg, random, consecutive, all, or by certain factors).</p></list-item></list><p>When selecting questions from an existing dataset, sampling can introduce bias as the selected questions may not be representative of the broader population of potential questions. For instance, a selection mechanism that favors questions of a specific length&#x2014;such as those with a short, predefined character count, perhaps to minimize computational costs&#x2014;would systematically exclude longer, more complex questions. Similarly, selecting from a small, nonrandom subset of available options could skew the results, as the chosen questions may not accurately reflect the diversity and range of questions encountered in the tool&#x2019;s intended use case. Therefore, the method for question selection, such as random, purposive, or consecutive sampling, must be clearly reported and justified.</p></sec><sec id="s3-2-1-3"><title>Question Manipulation</title><p>Supports for Judgment</p><list list-type="bullet"><list-item><p>If any questions were manipulated from the original source, describe and justify the rationale for the manipulation.</p></list-item><list-item><p>If any prompting was provided in addition to the index question, report the exact wording of the prompt(s).</p></list-item></list><p>Whether questions are created or extracted from existing sources, researchers may choose to manipulate them for various reasons. 
For example, slight variations might be introduced to test the model&#x2019;s robustness and bias, assessing how stable its responses are to minor changes in phrasing. Such manipulations are generally less likely to introduce significant bias, as their purpose is to probe the model&#x2019;s inherent stability rather than to alter the nature of the query. Conversely, questions might be manipulated to simplify them for processing by the model. A common example of this is the use of a system prompt that automatically extracts and restructures clinically relevant information before the model attempts to answer. This form of question manipulation, while potentially beneficial for processing, introduces a risk of bias because it may fundamentally alter the user&#x2019;s original query. Or if researchers correct spelling, terminology, or grammatical errors, or split multipart patient questions into separate questions, these changes may augment the performance of the LLM-QA model but not necessarily reflect real-world performance. It is essential that researchers provide both transparency and justification for any question manipulation, as the process could alter the original intent of the question, thereby compromising the validity of the evaluation. 
Along with direct question manipulation, all prompts, including system prompts, which in and of themselves do not necessarily introduce bias, should be clearly described and should be standardized and stable throughout testing, as differences in prompts may lead to false or misleading performance outcomes.</p></sec></sec></sec><sec id="s3-3"><title>Domain 2: Reference Answers</title><sec id="s3-3-1"><title>Reference Answer Source</title><p>Supports for Judgment:</p><list list-type="bullet"><list-item><p>If reference answers were generated specifically for the study, describe the method used to create the reference answer dataset, including who created the reference answers, and if the answers are reflective of a true reference standard.</p></list-item><list-item><p>If reference answers were selected from an existing reference answer source, adequately describe the source to allow an assessment of whether it is reflective of a true reference standard.</p></list-item></list><p>Often, LLM-QA studies benchmark LLM outputs against reference answers. Bias may be introduced if the reference answers do not accurately reflect a &#x201C;true&#x201D; or expected standard. For instance, if reference answers were created by individuals with a different level of expertise or with a different format or standard than the true reference standard, the reference standard may not be valid. For example, if the research team created the reference answers to a higher or lower standard than real-world physician-level responses, the reference standard would be misaligned with the intended quality benchmark. A mismatch in language, structure, style, or level of detail between the study reference standard used and the &#x201C;true&#x201D; real-world reference standard can lead to biased results. 
As a single ground truth does not always exist in medicine, the selected reference standard should be decided a priori (eg, guideline-based or expert consensus-based) and described and justified.</p></sec><sec id="s3-3-2"><title>Reference Answer Selection</title><p>Support for Judgment:</p><list list-type="bullet"><list-item><p>If not all reference answers to a given question were used, describe the method by which reference answers were selected.</p></list-item></list><p>Just as with question selection, the process of selecting reference answers from a larger pool can introduce sampling bias. This bias occurs if the selection method systematically favors answers with certain qualities, making the final set of reference answers unrepresentative of the full range of possible correct responses. For example, if a question has multiple valid reference answers but researchers consistently choose those with a specific tone or level of detail, the evaluation will be skewed toward models that produce similar outputs. Therefore, it is critical to describe and justify the method used for selecting reference answers.</p></sec></sec><sec id="s3-4"><title>Domain 3: LLM Answers</title><p>Support for Judgment:</p><list list-type="bullet"><list-item><p>Describe how many answers were generated for each question and if not all answers were assessed, describe how answers were selected for assessment.</p></list-item></list><p>When evaluating language models, it is often prudent to generate multiple answers for a single question to assess the model&#x2019;s stability or to explore the diversity of its outputs. In such instances, only evaluating a subset of the generated answers (eg, choosing the best one) may not be representative of the model&#x2019;s typical performance, thereby leading to an inaccurate or misleading evaluation. 
Therefore, it is crucial for researchers to transparently describe and justify how many answers were generated for each question and, if not all of them were assessed, to detail the specific methodology used to select the answers for evaluation.</p></sec><sec id="s3-5"><title>Domain 4: Evaluators</title><sec id="s3-5-1"><title>Evaluator Selection</title><p>Support for Judgment</p><list list-type="bullet"><list-item><p>Describe the method used to select evaluators, and assign evaluators to specific LLM qualities.</p></list-item></list><p>The selection of evaluators should reflect the intended real-world use and the required expertise to judge the domains being assessed. For example, having a physician evaluate the empathy of an LLM-QA&#x2019;s outputs may not reflect how a patient would assess this domain. Likewise, it would not be appropriate for a patient to evaluate the accuracy of health information generated by an LLM-QA, as they would lack the appropriate expertise. As many studies assess multiple outcomes, more than one type of evaluator may be required for a given study (eg, physicians evaluate accuracy and patients rate readability). Furthermore, the demographic or professional characteristics of the evaluators should align with the intended user population of the LLM or chatbot. For example, if an LLM is designed for a general audience but its readability is evaluated exclusively by individuals with advanced academic degrees, the results may not accurately reflect how an average user would perceive the content.</p></sec><sec id="s3-5-2"><title>Blinding of Evaluators</title><p>Support for Judgment:</p><list list-type="bullet"><list-item><p>Describe all measures used, if any, to blind trial evaluators and researchers from knowledge of the answer source. 
Provide information relating to whether the intended blinding was effective.</p></list-item></list><p>The integrity of an evaluation can be compromised if evaluators are not blinded to the source of the answers they are assessing (reference standard vs LLM-generated) because evaluators&#x2019; preexisting beliefs, attitudes, or knowledge about a specific technology, such as AI, or even to a specific LLM model, can unconsciously influence their ratings. Thus, evaluators should be blinded to the answer source, and researchers should describe the blinding measures employed. Since naive blinding is not guaranteed to be effective given stylistic markers in LLM-generated text, authors should describe the steps taken to assess or verify the effectiveness of the blinding (eg, ask the evaluators if they could identify the AI-generated answer).</p></sec></sec><sec id="s3-6"><title>Domain 5: Outcomes&#x2014;Performance Metrics</title><p>Supports for Judgment:</p><list list-type="bullet"><list-item><p>Describe specific metrics used for each outcome quality.</p></list-item><list-item><p>Describes if desired outcomes were prespecified prior to conducting the study.</p></list-item></list><p>To minimize the risk of bias, 2 crucial steps should be taken. First, the desired outcomes or hypotheses of the study should be prespecified prior to conducting any analysis. Second, the metrics selected to measure each outcome must directly align with the stated goals of the study. For example, if the goal is to evaluate a chatbot&#x2019;s ability to provide concise summaries of medical information, metrics should focus on conciseness and accuracy, rather than on secondary qualities, such as conversational tone or creativity. 
A misalignment between metrics and study goals introduces bias, as the evaluation would not accurately reflect the model&#x2019;s performance on its intended task.</p></sec><sec id="s3-7"><title>Additional Considerations</title><p>Complete and transparent reporting is required to judge the risk of bias. Researchers must account for any instances of missing data and describe how missing data were handled. For example, if certain questions were too long for the model to process, or if the model failed to produce a response, these omissions should be explicitly noted and their potential impact on the evaluation should be discussed. Similarly, if human evaluators did not complete all of their annotations, it is important that this missingness is reported and ideally investigated, as these instances of missingness may not be random and could introduce bias if not accounted for. Study outcomes should be decided <italic>a priori</italic> and deviations should be described and justified. By reporting a subset of all measured outcomes or manipulating the analysis post hoc to achieve a different result (eg, by recategorizing certain groups, shifting the scale), a study may present a distorted or optimistic view of model performance. The use of appropriate reporting checklists is recommended.</p><p>Conflicts of interest (especially commercial interests) may introduce explicit or subconscious biases in the formulation of the problem, the execution of the analysis, or the interpretation of the results. If there are conflicts of interest (eg, authors funded or affiliated with model vendors), they must be disclosed and mitigated, if possible.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>The AQAT:RoB tool was developed to standardize the quality assessment and specifically, the risk of bias assessments, of studies in which there are human evaluations of LLM-QA systems. 
This easy-to-use risk of bias assessment tool covers 5 major domains (Questions, Reference answers, LLM answers, Evaluators, and Outcomes)
The foundational tools listed in <xref ref-type="table" rid="table1">Table 1</xref> [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref7">7</xref>] either do not provide a threshold for determining overall risk of bias or use a &#x201C;worst-of&#x201D; approach, where the overall risk of bias is considered &#x201C;High risk&#x201D; if at least 1 domain is judged as &#x201C;High risk.&#x201D; We have chosen to leave the determination of overall risk of bias to the discretion of the user, as the threshold may be different depending on the intended use of the tool, although in most cases, a single domain being judged as &#x201C;High risk&#x201D; would likely result in an overall judgment of &#x201C;High risk.&#x201D;</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Comparison of Alberta Quality Assessment Tool: Risk of Bias (AQAT:RoB) and foundational risk of bias tools<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">;AQAT:RoB domain</td><td align="left" valign="bottom">RoB 2<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> [<xref ref-type="bibr" rid="ref4">4</xref>]</td><td align="left" valign="bottom">NOS<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> [<xref ref-type="bibr" rid="ref6">6</xref>]</td><td align="left" valign="bottom">QUADAS-2<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup> [<xref ref-type="bibr" rid="ref5">5</xref>]</td><td align="left" valign="bottom">ROBINS-E<sup><xref ref-type="table-fn" rid="table1fn5">e</xref></sup> [<xref ref-type="bibr" rid="ref7">7</xref>]</td></tr></thead><tbody><tr><td align="left" valign="top">Questions</td><td align="left" valign="top">x<sup><xref ref-type="table-fn" rid="table1fn6">f</xref></sup></td><td align="left" valign="top">x</td><td align="left" valign="top">x</td><td align="left" valign="top">x</td></tr><tr><td 
align="left" valign="top">Reference answers</td><td align="left" valign="top">x</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Domain 1: Selection</p></list-item><list-item><p>Domain 2: Comparability</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Domain 3: Reference standard</p></list-item><list-item><p>Domain 4: Flow and timing</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Domain 2: Risk of bias arising from measurement of the exposure</p></list-item></list></td></tr><tr><td align="left" valign="top">LLM<sup><xref ref-type="table-fn" rid="table1fn7">g</xref></sup> answers</td><td align="left" valign="top">x</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Domain 1: Selection</p></list-item><list-item><p>Domain 2: Comparability</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Domain 2: Index test or tests</p></list-item><list-item><p>Domain 4: Flow and timing</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Domain 2: Risk of bias arising from measurement of the exposure</p></list-item></list></td></tr><tr><td align="left" valign="top">Evaluators</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Domain 4: Risk of bias in measurement of the outcome</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Domain 1: Selection</p></list-item><list-item><p>Domain 3: Outcomes</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Domain 1: Patient selection</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Domain 3: Risk of bias in selection of participants into the study (or into the analysis)</p></list-item></list></td></tr><tr><td align="left" valign="top">Outcomes</td><td 
align="left" valign="top"><list list-type="bullet"><list-item><p>Domain 4: Risk of bias in measurement of the outcome</p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Domain 3: Outcomes</p></list-item></list></td><td align="left" valign="top">x</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Domain 6: Risk of bias arising from measurement of the outcome</p></list-item></list></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>This table compares the coverage of key domains determined to be important for assessing risk of bias in large language model question-answer studies by the Alberta Quality Assessment Tool: Risk of Bias (AQAT:RoB)  compared with existing foundational risk of bias tools. No AQAT:RoB domains were covered adequately by other Foundational Risk of Bias tools. Terms in italics signify the closest domain found in the foundational tool.</p></fn><fn id="table1fn2"><p><sup>b</sup>RoB 2: Cochrane Risk of Bias 2 tool.</p></fn><fn id="table1fn3"><p><sup>c</sup>NOS: Newcastle-Ottawa Scale.</p></fn><fn id="table1fn4"><p><sup>d</sup>QUADAS-2: Quality Assessment of Diagnostic Accuracy Studies 2.</p></fn><fn id="table1fn5"><p><sup>e</sup>ROBINS-E: Risk of Bias in Non-randomized Studies - of Exposures.</p></fn><fn id="table1fn6"><p><sup>f</sup>x: not covered by the tool.</p></fn><fn id="table1fn7"><p><sup>g</sup>LLM: large language model.</p></fn></table-wrap-foot></table-wrap><p>The AQAT:RoB is the first risk of bias assessment tool specifically designed for studies of human-evaluated LLM-QA systems. Despite this, systematic reviews of LLM-QA studies have already been published [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>], highlighting the urgency of the need for the AQAT:RoB. 
Previous AI evaluation frameworks primarily function as reporting checklists [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. While these are invaluable for assessing transparency and reproducibility, they do not directly evaluate a study&#x2019;s susceptibility to bias. For instance, a study may meticulously report every detail, yet still contain high-risk elements, such as a lack of blinding for answer sources, that could compromise its findings. Existing AI-specific quality assessment tools, such as PROBAST-AI [<xref ref-type="bibr" rid="ref8">8</xref>] and APPRAISE-AI [<xref ref-type="bibr" rid="ref9">9</xref>], are tailored for predictive machine learning models, which differ significantly from the evaluation needs of LLM-QA studies. Templin et al [<xref ref-type="bibr" rid="ref47">47</xref>] introduced a useful 5-step framework for auditing LLMs, but not for assessing the validity of LLM-QA studies. Therefore, the AQAT:RoB addresses a critical void in the standardized evaluation of LLM-QA research.</p><p>Aiming to address the urgent need for a risk of bias assessment tool for this growing field and aided by our own existing [<xref ref-type="bibr" rid="ref16">16</xref>] and recently published systematic reviews [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>], we sought to develop the AQAT:RoB through a systematic, but pragmatic approach. With a highly engaged group of interdisciplinary experts, our pilot validation was able to achieve high interrater agreement after the first round of modified Delphi and consensus. In our pilot validation of 16 studies, the interrater reliability was on par with or better than that of existing foundational risk of bias tools. 
For example, studies have found that the RoB 2 demonstrated kappas consistent with &#x201C;fair&#x201D; agreement (0.21&#x2010;0.40) [<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref49">49</xref>]. Another study found that while the interrater reliability across 6 risk of bias tools for nonrandomized studies varied widely, most demonstrated intraclass correlation coefficients in the substantial range, similar to the AQAT:RoB [<xref ref-type="bibr" rid="ref50">50</xref>]. No studies have demonstrated the interrater reliability of existing RoB tools specifically to LLM-QA studies.</p></sec><sec id="s4-3"><title>Limitations</title><p>We recognize that there are limitations to the AQAT:RoB. First, while it was developed by a wide interdisciplinary group of experts and patient partners, most experts were from Alberta due to the nature of the AQAT collaborative, which includes the development of other quality assessment tools for AI-related studies, including the development of publicly available datasets, validated evaluation scales, and measurements and programs. In future updates of the AQAT:RoB, we plan to engage a more international group of partners. It will also be crucial for the AQAT:RoB to be extensively and externally validated by the broader international community of researchers [<xref ref-type="bibr" rid="ref51">51</xref>]. Furthermore, the AQAT:RoB was developed with a medical focus and would require validation and adaptation for nonmedical studies of LLM-QA. Second, we recognize that this tool will need to evolve with the rapid development of LLM tools and related studies. Third, this tool was developed in English and evaluated only on English-language studies. In order to use this tool on non-English&#x2013;language studies, it would ideally be translated and validated in other languages. 
Multilingual and non-English evaluations may require an expansion of the support for judgments to be considered when classifying the risk of bias. For example, if questions or reference answers are translated from English, they may not accurately reflect the distribution against which they will be evaluated (ie, native, nontranslated questions).</p><p>Finally, the pilot validation has limitations. It included only 16 studies, a relatively small sample, which may limit reliability estimates. While construct validity was supported by expert consensus and signaling questions that map directly to known mechanisms of bias, our pilot validation focused on interrater reliability, the most commonly reported evaluation metric for such tools. To further support construct validity, future evaluations that test concurrent and criterion validity are needed, although the lack of truly comparative tools and meta-analyses in this field limits the feasibility of these types of evaluations. Finally, reliance on an aggregate score for our a priori threshold makes it possible for domains with high agreement to compensate for domains with poor agreement. We acknowledge that agreement for the LLM Answer Selection domain was lower or less reliable than for the other domains and remains experimental, requiring future refinements after more extensive validation.</p></sec><sec id="s4-4"><title>Future Directions</title><p>The AQAT:RoB is a crucial next step toward standardizing the evaluation of LLM-QA studies in medicine. While we acknowledge that the tool will benefit from future refinements, more extensive validation (particularly external validation), and periodic updates to keep pace with evolving technology, we believe it currently fills an urgent need and critical gap. 
The immediate application of this tool will enable researchers, clinicians, and policymakers to more effectively and rigorously assess the validity of LLM-based studies, thereby ensuring that real-world applications of this technology are built on a solid foundation of reliable evidence.</p></sec><sec id="s4-5"><title>Conclusions</title><p>The AQAT:RoB demonstrates promising initial reliability for assessing the validity or risk of bias of LLM-QA studies.</p></sec></sec></body><back><ack><p>We thank the Alberta Machine Intelligence Institute, AI+Health Hub, and the Centre for Health Informatics for providing administrative and recruitment support. We thank Ms Dagmara Chojecki, MLIS, health sciences librarian, for her assistance with the systematic literature review.</p></ack><notes><sec><title>Funding</title><p>This research is supported by the Canadian Institutes of Health Research (number 96047). MA is supported by a Canada CIFAR AI Chair. RM is supported by a Canada CIFAR AI Chair and the Alberta Health Services Chair in AI in Health. LM is supported by NSERC and Canada CIFAR AI Chair. RG is supported by Amii and NSERC. JM is supported by a Canada CIFAR AI Chair and by Medical Imaging Consultants. 
SK is supported by the Kidney Health Research Chair and the Division of Nephrology at the University of Alberta.</p></sec><sec><title>Data Availability</title><p>The data extracted and synthesized for this study are available in the multimedia appendix files.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: CY</p><p>Data curation: CY, MA</p><p>Formal analysis: CY, MA</p><p>Investigation: AA, ALF, CEHB, CY, DCB, DGO, FM, GL, HJ, JC, JLJ, JRM, LM, MA, MM, MWBG, PB, PT, RC, RG, S Katz, S Klarenbach, WH, ZM</p><p>Methodology: CY, DCB, JRM, MA</p><p>Writing &#x2013; original draft: CY, MA</p><p>Writing &#x2013; review &#x0026; editing: AA, ALF, CEHB, CY, DCB, DGO, FM, GL, HJ, JC, JLJ, JRM, LM, MA, MM, MWBG, PB, PT, RC, RG, S Katz, S Klarenbach, WH, ZM</p></fn><fn fn-type="conflict"><p>S Klarenbach is Director of the Real World Evidence Consortium, and Alberta Drug and Therapeutic Evaluation Consortium (Universities of Alberta, Calgary, and Institute of Health Economics); these entities receive funding from decision-makers and industry to conduct research. All research funding is made to the academic institution; investigators retain full rights of academic freedom and right to publish. This relationship is not related to the current work. 
All other authors declare no conflicts of interest.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">AQAT:RoB</term><def><p>Alberta Quality Assessment Tool: Risk of Bias</p></def></def-item><def-item><term id="abb3">CHART</term><def><p>Chatbot Assessment Reporting Tool</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">LLM-QA</term><def><p>LLM question-answer system</p></def></def-item><def-item><term id="abb6">NOS</term><def><p>Newcastle-Ottawa Scale</p></def></def-item><def-item><term id="abb7">PROBAST-AI</term><def><p>Prediction model Risk of Bias Assessment Tool for Artificial Intelligence</p></def></def-item><def-item><term id="abb8">QUADAS-2</term><def><p>Quality Assessment of Diagnostic Accuracy Studies 2</p></def></def-item><def-item><term id="abb9">RoB 2</term><def><p>Cochrane Risk of Bias 2 tool</p></def></def-item><def-item><term id="abb10">ROBINS-E</term><def><p>Risk of Bias in Non-randomized Studies - of Exposures tool</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Iqbal</surname><given-names>U</given-names> </name><name name-style="western"><surname>Tanweer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rahmanti</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Greenfield</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>LTJ</given-names> </name><name name-style="western"><surname>Li</surname><given-names>YCJ</given-names> </name></person-group><article-title>Impact of large language model (ChatGPT) in healthcare: an umbrella review and evidence 
synthesis</article-title><source>J Biomed Sci</source><year>2025</year><month>05</month><day>7</day><volume>32</volume><issue>1</issue><fpage>45</fpage><pub-id pub-id-type="doi">10.1186/s12929-025-01131-z</pub-id><pub-id pub-id-type="medline">40335969</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Orr-Ewing</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Testing and evaluation of health care applications of large language models: a systematic review</article-title><source>JAMA</source><year>2025</year><month>01</month><day>28</day><volume>333</volume><issue>4</issue><fpage>319</fpage><lpage>328</lpage><pub-id pub-id-type="doi">10.1001/jama.2024.21700</pub-id><pub-id pub-id-type="medline">39405325</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah</surname><given-names>NH</given-names> </name><name name-style="western"><surname>Entwistle</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pfeffer</surname><given-names>MA</given-names> </name></person-group><article-title>Creation and adoption of large language models in medicine</article-title><source>JAMA</source><year>2023</year><month>09</month><day>5</day><volume>330</volume><issue>9</issue><fpage>866</fpage><lpage>869</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.14217</pub-id><pub-id pub-id-type="medline">37548965</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Sterne</surname><given-names>JAC</given-names> </name><name name-style="western"><surname>Savovi&#x0107;</surname><given-names>J</given-names> </name><name name-style="western"><surname>Page</surname><given-names>MJ</given-names> </name><etal/></person-group><article-title>RoB 2: a revised tool for assessing risk of bias in randomised trials</article-title><source>BMJ</source><year>2019</year><month>08</month><day>28</day><volume>366</volume><fpage>l4898</fpage><pub-id pub-id-type="doi">10.1136/bmj.l4898</pub-id><pub-id pub-id-type="medline">31462531</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Whiting</surname><given-names>PF</given-names> </name><name name-style="western"><surname>Rutjes</surname><given-names>AWS</given-names> </name><name name-style="western"><surname>Westwood</surname><given-names>ME</given-names> </name><etal/></person-group><article-title>QUADAS-2: a revised tool for the quality assessment of diagnostic accuracy studies</article-title><source>Ann Intern Med</source><year>2011</year><month>10</month><day>18</day><volume>155</volume><issue>8</issue><fpage>529</fpage><lpage>536</lpage><pub-id pub-id-type="doi">10.7326/0003-4819-155-8-201110180-00009</pub-id><pub-id pub-id-type="medline">22007046</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lo</surname><given-names>CKL</given-names> </name><name name-style="western"><surname>Mertz</surname><given-names>D</given-names> </name><name name-style="western"><surname>Loeb</surname><given-names>M</given-names> </name></person-group><article-title>Newcastle-Ottawa Scale: comparing reviewers&#x2019; to authors&#x2019; assessments</article-title><source>BMC Med Res 
Methodol</source><year>2014</year><month>04</month><day>1</day><volume>14</volume><fpage>45</fpage><pub-id pub-id-type="doi">10.1186/1471-2288-14-45</pub-id><pub-id pub-id-type="medline">24690082</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Higgins</surname><given-names>JPT</given-names> </name><name name-style="western"><surname>Morgan</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Rooney</surname><given-names>AA</given-names> </name><etal/></person-group><article-title>A tool to assess risk of bias in non-randomized follow-up studies of exposure effects (ROBINS-E)</article-title><source>Environ Int</source><year>2024</year><month>04</month><volume>186</volume><fpage>108602</fpage><pub-id pub-id-type="doi">10.1016/j.envint.2024.108602</pub-id><pub-id pub-id-type="medline">38555664</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Collins</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Dhiman</surname><given-names>P</given-names> </name><name name-style="western"><surname>Andaur Navarro</surname><given-names>CL</given-names> </name><etal/></person-group><article-title>Protocol for development of a reporting guideline (TRIPOD-AI) and risk of bias tool (PROBAST-AI) for diagnostic and prognostic prediction model studies based on artificial intelligence</article-title><source>BMJ Open</source><year>2021</year><month>07</month><day>9</day><volume>11</volume><issue>7</issue><fpage>e048008</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2020-048008</pub-id><pub-id pub-id-type="medline">34244270</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Kwong</surname><given-names>JCC</given-names> </name><name name-style="western"><surname>Khondker</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lajkosz</surname><given-names>K</given-names> </name><etal/></person-group><article-title>APPRAISE-AI tool for quantitative evaluation of AI studies for clinical decision support</article-title><source>JAMA Netw Open</source><year>2023</year><month>09</month><day>5</day><volume>6</volume><issue>9</issue><fpage>e2335377</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.35377</pub-id><pub-id pub-id-type="medline">37747733</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huo</surname><given-names>B</given-names> </name><name name-style="western"><surname>Boyle</surname><given-names>A</given-names> </name><name name-style="western"><surname>Marfo</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Large language models for chatbot health advice studies: a systematic review</article-title><source>JAMA Netw Open</source><year>2025</year><month>02</month><day>3</day><volume>8</volume><issue>2</issue><fpage>e2457879</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.57879</pub-id><pub-id pub-id-type="medline">39903463</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhuang</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Accuracy of large language models when answering clinical research questions: systematic review and network meta-analysis</article-title><source>J 
Med Internet Res</source><year>2025</year><month>04</month><day>30</day><volume>27</volume><fpage>e64486</fpage><pub-id pub-id-type="doi">10.2196/64486</pub-id><pub-id pub-id-type="medline">40305085</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Collins</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Moons</surname><given-names>KGM</given-names> </name><name name-style="western"><surname>Dhiman</surname><given-names>P</given-names> </name><etal/></person-group><article-title>TRIPOD+AI statement: updated guidance for reporting clinical prediction models that use regression or machine learning methods</article-title><source>BMJ</source><year>2024</year><month>04</month><day>16</day><volume>385</volume><fpage>e078378</fpage><pub-id pub-id-type="doi">10.1136/bmj-2023-078378</pub-id><pub-id pub-id-type="medline">38626948</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>CHART Collaborative</collab></person-group><article-title>Reporting guideline for chatbot health advice studies: the Chatbot Assessment Reporting Tool (CHART) statement</article-title><source>BMJ Med</source><year>2025</year><volume>4</volume><issue>1</issue><fpage>e001632</fpage><pub-id pub-id-type="doi">10.1136/bmjmed-2025-001632</pub-id><pub-id pub-id-type="medline">40761518</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>El Emam</surname><given-names>K</given-names> </name><name name-style="western"><surname>Leung</surname><given-names>TI</given-names> </name><name name-style="western"><surname>Malin</surname><given-names>B</given-names> </name><name 
name-style="western"><surname>Klement</surname><given-names>W</given-names> </name><name name-style="western"><surname>Eysenbach</surname><given-names>G</given-names> </name></person-group><article-title>Consolidated reporting guidelines for prognostic and diagnostic machine learning models (CREMLS)</article-title><source>J Med Internet Res</source><year>2024</year><month>05</month><day>2</day><volume>26</volume><fpage>e52508</fpage><pub-id pub-id-type="doi">10.2196/52508</pub-id><pub-id pub-id-type="medline">38696776</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gallifant</surname><given-names>J</given-names> </name><name name-style="western"><surname>Afshar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ameen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The TRIPOD-LLM reporting guideline for studies using large language models</article-title><source>Nat Med</source><year>2025</year><month>01</month><volume>31</volume><issue>1</issue><fpage>60</fpage><lpage>69</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03425-5</pub-id><pub-id pub-id-type="medline">39779929</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>The use of large language models in patient education interventions: a systematic review</article-title><source>PROSPERO</source><access-date>2025-09-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.crd.york.ac.uk/PROSPERO/view/CRD42023461630">https://www.crd.york.ac.uk/PROSPERO/view/CRD42023461630</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><article-title>TCPS 2 (2022) &#x2013; chapter 2: scope and approach</article-title><source>Government of 
Canada</source><year>2023</year><access-date>2026-03-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ethics.gc.ca/eng/tcps2-eptc2_2022_chapter2-chapitre2.html">https://ethics.gc.ca/eng/tcps2-eptc2_2022_chapter2-chapitre2.html</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>Tools in development</article-title><source>Latitudes Network</source><year>2023</year><access-date>2025-09-02</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.latitudes-network.org/library/tools-in-development/">https://www.latitudes-network.org/library/tools-in-development/</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><source>Latitudes Network</source><year>2023</year><access-date>2025-09-16</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.latitudes-network.org/">https://www.latitudes-network.org/</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="web"><article-title>RoB 2 tool</article-title><source>Risk of bias tools</source><access-date>2025-06-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://sites.google.com/site/riskofbiastool/welcome/rob-2-0-tool?authuser=0">https://sites.google.com/site/riskofbiastool/welcome/rob-2-0-tool?authuser=0</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><article-title>QUADAS | Bristol medical school: population health sciences</article-title><source>University of Bristol</source><access-date>2025-06-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.bristol.ac.uk/population-health-sciences/projects/quadas/">https://www.bristol.ac.uk/population-health-sciences/projects/quadas/</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation 
citation-type="web"><source>Alberta Machine Intelligence Institute</source><access-date>2025-09-02</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.amii.ca/">https://www.amii.ca/</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><article-title>AI + Health Hub</article-title><source>University of Alberta</source><access-date>2025-09-02</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.ualberta.ca/en/health-sciences/research/ai-and-health-hub.html">https://www.ualberta.ca/en/health-sciences/research/ai-and-health-hub.html</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><article-title>Centre for health informatics</article-title><source>University of Calgary</source><access-date>2025-09-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cumming.ucalgary.ca/centres/centre-health-informatics">https://cumming.ucalgary.ca/centres/centre-health-informatics</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bernstein</surname><given-names>IA</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>YV</given-names> </name><name name-style="western"><surname>Govil</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Comparison of ophthalmologist and large language model chatbot responses to online patient eye care questions</article-title><source>JAMA Netw Open</source><year>2023</year><month>08</month><day>1</day><volume>6</volume><issue>8</issue><fpage>e2330320</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.30320</pub-id><pub-id pub-id-type="medline">37606922</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Cappellani</surname><given-names>F</given-names> </name><name name-style="western"><surname>Card</surname><given-names>KR</given-names> </name><name name-style="western"><surname>Shields</surname><given-names>CL</given-names> </name><name name-style="western"><surname>Pulido</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Haller</surname><given-names>JA</given-names> </name></person-group><article-title>Reliability and accuracy of artificial intelligence ChatGPT in providing information on ophthalmic diseases and management to patients</article-title><source>Eye</source><year>2024</year><month>05</month><volume>38</volume><issue>7</issue><fpage>1368</fpage><lpage>1373</lpage><pub-id pub-id-type="doi">10.1038/s41433-023-02906-0</pub-id><pub-id pub-id-type="medline">38245622</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chaker</surname><given-names>SC</given-names> </name><name name-style="western"><surname>Hung</surname><given-names>YC</given-names> </name><name name-style="western"><surname>Saad</surname><given-names>M</given-names> </name><name name-style="western"><surname>Golinko</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Galdyn</surname><given-names>IA</given-names> </name></person-group><article-title>Easing the burden on caregivers&#x2014;applications of artificial intelligence for physicians and caregivers of children with cleft lip and palate</article-title><source>Cleft Palate Craniofac J</source><year>2025</year><month>04</month><volume>62</volume><issue>4</issue><fpage>574</fpage><lpage>587</lpage><pub-id pub-id-type="doi">10.1177/10556656231223596</pub-id><pub-id pub-id-type="medline">38178785</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation 
citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kann</surname><given-names>BH</given-names> </name><name name-style="western"><surname>Foote</surname><given-names>MB</given-names> </name><etal/></person-group><article-title>The utility of ChatGPT for cancer treatment information</article-title><source>medRxiv</source><comment>Preprint posted online on  Mar 23, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.03.16.23287316</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chervenak</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lieman</surname><given-names>H</given-names> </name><name name-style="western"><surname>Blanco-Breindel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jindal</surname><given-names>S</given-names> </name></person-group><article-title>The promise and peril of using a large language model to obtain clinical information: ChatGPT performs strongly as a fertility counseling tool with limitations</article-title><source>Fertil Steril</source><year>2023</year><month>09</month><volume>120</volume><issue>3</issue><fpage>575</fpage><lpage>583</lpage><pub-id pub-id-type="doi">10.1016/j.fertnstert.2023.05.151</pub-id><pub-id pub-id-type="medline">37217092</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Coskun</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ocakoglu</surname><given-names>G</given-names> </name><name name-style="western"><surname>Yetemen</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Kaygisiz</surname><given-names>O</given-names> </name></person-group><article-title>Can ChatGPT, an artificial intelligence language model, provide accurate and high-quality patient information on prostate cancer?</article-title><source>Urology</source><year>2023</year><month>10</month><volume>180</volume><fpage>35</fpage><lpage>58</lpage><pub-id pub-id-type="doi">10.1016/j.urology.2023.05.040</pub-id><pub-id pub-id-type="medline">37406864</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gabriel</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shafik</surname><given-names>L</given-names> </name><name name-style="western"><surname>Alanbuki</surname><given-names>A</given-names> </name><name name-style="western"><surname>Larner</surname><given-names>T</given-names> </name></person-group><article-title>The utility of the ChatGPT artificial intelligence tool for patient education and enquiry in robotic radical prostatectomy</article-title><source>Int Urol Nephrol</source><year>2023</year><month>11</month><volume>55</volume><issue>11</issue><fpage>2717</fpage><lpage>2732</lpage><pub-id pub-id-type="doi">10.1007/s11255-023-03729-4</pub-id><pub-id pub-id-type="medline">37528247</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>HY</given-names> </name><name name-style="western"><surname>Alessandri Bonetti</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jeong</surname><given-names>T</given-names> </name><name name-style="western"><surname>Pandya</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nguyen</surname><given-names>VT</given-names> </name><name 
name-style="western"><surname>Egro</surname><given-names>FM</given-names> </name></person-group><article-title>Dr. ChatGPT will see you now: how do Google and ChatGPT compare in answering patient questions on breast reconstruction?</article-title><source>J Plast Reconstr Aesthet Surg</source><year>2023</year><month>10</month><volume>85</volume><fpage>488</fpage><lpage>497</lpage><pub-id pub-id-type="doi">10.1016/j.bjps.2023.07.039</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kianian</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>D</given-names> </name><name name-style="western"><surname>Giaconi</surname><given-names>J</given-names> </name></person-group><article-title>Can ChatGPT aid clinicians in educating patients on the surgical management of glaucoma?</article-title><source>J Glaucoma</source><year>2024</year><month>02</month><day>1</day><volume>33</volume><issue>2</issue><fpage>94</fpage><lpage>100</lpage><pub-id pub-id-type="doi">10.1097/IJG.0000000000002338</pub-id><pub-id pub-id-type="medline">38031276</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McCarthy</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Berkowitz</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ramalingam</surname><given-names>V</given-names> </name><name name-style="western"><surname>Ahmed</surname><given-names>M</given-names> </name></person-group><article-title>Evaluation of an artificial intelligence chatbot for delivery of IR patient education material: a comparison with societal website content</article-title><source>J Vasc Interv 
Radiol</source><year>2023</year><month>10</month><volume>34</volume><issue>10</issue><fpage>1760</fpage><lpage>1768</lpage><pub-id pub-id-type="doi">10.1016/j.jvir.2023.05.037</pub-id><pub-id pub-id-type="medline">37330210</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Padovan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cosci</surname><given-names>B</given-names> </name><name name-style="western"><surname>Petillo</surname><given-names>A</given-names> </name><etal/></person-group><article-title>ChatGPT in occupational medicine: a comparative study with human experts</article-title><source>Bioengineering (Basel)</source><year>2024</year><month>01</month><day>6</day><volume>11</volume><issue>1</issue><fpage>57</fpage><pub-id pub-id-type="doi">10.3390/bioengineering11010057</pub-id><pub-id pub-id-type="medline">38247934</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thia</surname><given-names>I</given-names> </name><name name-style="western"><surname>Saluja</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT: is this patient education tool for urological malignancies readable for the general population?</article-title><source>Res Rep Urol</source><year>2024</year><volume>16</volume><fpage>31</fpage><lpage>37</lpage><pub-id pub-id-type="doi">10.2147/RRU.S440633</pub-id><pub-id pub-id-type="medline">38259300</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mayo-Y&#x00E1;&#x00F1;ez</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lechien</surname><given-names>JR</given-names> </name><name 
name-style="western"><surname>Maria-Saibene</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vaira</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Maniaci</surname><given-names>A</given-names> </name><name name-style="western"><surname>Chiesa-Estomba</surname><given-names>CM</given-names> </name></person-group><article-title>Examining the performance of ChatGPT 3.5 and Microsoft Copilot in otolaryngology: a comparative study with otolaryngologists&#x2019; evaluation</article-title><source>Indian J Otolaryngol Head Neck Surg</source><year>2024</year><month>08</month><volume>76</volume><issue>4</issue><fpage>3465</fpage><lpage>3469</lpage><pub-id pub-id-type="doi">10.1007/s12070-024-04729-1</pub-id><pub-id pub-id-type="medline">39130248</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pressman</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Borna</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gomez-Cabello</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Haider</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Forte</surname><given-names>AJ</given-names> </name></person-group><article-title>AI in hand surgery: assessing large language models in the classification and management of hand injuries</article-title><source>J Clin Med</source><year>2024</year><month>05</month><day>11</day><volume>13</volume><issue>10</issue><fpage>2832</fpage><pub-id pub-id-type="doi">10.3390/jcm13102832</pub-id><pub-id pub-id-type="medline">38792374</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mesnier</surname><given-names>J</given-names> 
</name><name name-style="western"><surname>Suc</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sayah</surname><given-names>N</given-names> </name><name name-style="western"><surname>Abtan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Steg</surname><given-names>PG</given-names> </name></person-group><article-title>Relevance of medical information obtained from ChatGPT: are large language models friends or foes?</article-title><source>Arch Cardiovasc Dis</source><year>2023</year><month>10</month><volume>116</volume><issue>10</issue><fpage>485</fpage><lpage>486</lpage><pub-id pub-id-type="doi">10.1016/j.acvd.2023.07.009</pub-id><pub-id pub-id-type="medline">37718185</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rizwan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sadiq</surname><given-names>T</given-names> </name></person-group><article-title>The use of AI in diagnosing diseases and providing management plans: a consultation on cardiovascular disorders with ChatGPT</article-title><source>Cureus</source><year>2023</year><month>08</month><volume>15</volume><issue>8</issue><fpage>e43106</fpage><pub-id pub-id-type="doi">10.7759/cureus.43106</pub-id><pub-id pub-id-type="medline">37692649</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Haahr</surname><given-names>M</given-names> </name></person-group><source>RANDOM.ORG</source><access-date>2025-09-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.random.org/">https://www.random.org/</ext-link></comment></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Landis</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Koch</surname><given-names>GG</given-names> </name></person-group><article-title>The measurement of observer agreement for categorical data</article-title><source>Biometrics</source><year>1977</year><month>03</month><volume>33</volume><issue>1</issue><fpage>159</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.2307/2529310</pub-id><pub-id pub-id-type="medline">843571</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="web"><source>Alberta Quality Assessment Tools</source><access-date>2026-03-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://aqat.ai/#tools">https://aqat.ai/#tools</ext-link></comment></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Abdalla</surname><given-names>M</given-names> </name><name name-style="western"><surname>Abdalla</surname><given-names>M</given-names> </name></person-group><article-title>The grey hoodie project: big tobacco, big tech, and the threat on academic integrity</article-title><year>2021</year><conf-name>AIES &#x2019;21: Proceedings of the 2021 AAAI/ACM Conference on AI, Ethics, and Society</conf-name><conf-date>May 19-21, 2021</conf-date><conf-loc>Virtual event</conf-loc><publisher-name>ACM</publisher-name><pub-id pub-id-type="doi">10.1145/3461702.3462563</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Aitken</surname><given-names>W</given-names> </name><name name-style="western"><surname>Abdalla</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rudie</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Stinson</surname><given-names>C</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ku</surname><given-names>LW</given-names> </name><name name-style="western"><surname>Martins</surname><given-names>A</given-names> </name><name name-style="western"><surname>Srikumar</surname><given-names>V</given-names> </name></person-group><article-title>Collaboration or corporate capture? Quantifying NLP&#x2019;s reliance on industry artifacts and contributions</article-title><year>2024</year><conf-name>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name><conf-date>Aug 11-16, 2024</conf-date><conf-loc>Bangkok, Thailand</conf-loc><publisher-name>Association for Computational Linguistics</publisher-name><fpage>3433</fpage><lpage>3448</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.acl-long.188</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Young</surname><given-names>M</given-names> </name><name name-style="western"><surname>Katell</surname><given-names>M</given-names> </name><name name-style="western"><surname>Krafft</surname><given-names>PM</given-names> </name></person-group><article-title>Confronting power and corporate capture at the FAccT conference</article-title><year>2022</year><conf-name>FAccT &#x2019;22: Proceedings of the 2022 ACM Conference on Fairness, Accountability, and Transparency</conf-name><conf-date>Jun 21-24, 2022</conf-date><conf-loc>Seoul, South Korea</conf-loc><publisher-name>ACM</publisher-name><pub-id pub-id-type="doi">10.1145/3531146.3533194</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Templin</surname><given-names>T</given-names> 
</name><name name-style="western"><surname>Fort</surname><given-names>S</given-names> </name><name name-style="western"><surname>Padmanabham</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Framework for bias evaluation in large language models in healthcare settings</article-title><source>NPJ Digit Med</source><year>2025</year><month>07</month><day>7</day><volume>8</volume><issue>1</issue><fpage>414</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01786-w</pub-id><pub-id pub-id-type="medline">40624264</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Minozzi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cinquini</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gianola</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gonzalez-Lorenzo</surname><given-names>M</given-names> </name><name name-style="western"><surname>Banzi</surname><given-names>R</given-names> </name></person-group><article-title>The revised Cochrane risk of bias tool for randomized trials (RoB 2) showed low interrater reliability and challenges in its application</article-title><source>J Clin Epidemiol</source><year>2020</year><month>10</month><volume>126</volume><fpage>37</fpage><lpage>44</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2020.06.015</pub-id><pub-id pub-id-type="medline">32562833</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hartling</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hamm</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Milne</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Testing the 
risk of bias tool showed low reliability between individual reviewers and across consensus assessments of reviewer pairs</article-title><source>J Clin Epidemiol</source><year>2013</year><month>09</month><volume>66</volume><issue>9</issue><fpage>973</fpage><lpage>981</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2012.07.005</pub-id><pub-id pub-id-type="medline">22981249</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kalaycioglu</surname><given-names>I</given-names> </name><name name-style="western"><surname>Rioux</surname><given-names>B</given-names> </name><name name-style="western"><surname>Briard</surname><given-names>JN</given-names> </name><etal/></person-group><article-title>Inter-rater reliability of risk of bias tools for non-randomized studies</article-title><source>Syst Rev</source><year>2023</year><month>12</month><day>7</day><volume>12</volume><issue>1</issue><fpage>227</fpage><pub-id pub-id-type="doi">10.1186/s13643-023-02389-w</pub-id><pub-id pub-id-type="medline">38057883</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tomlinson</surname><given-names>E</given-names> </name><name name-style="western"><surname>Cooper</surname><given-names>C</given-names> </name><name name-style="western"><surname>Davenport</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Common challenges and suggestions for risk of bias tool development: a systematic review of methodological studies</article-title><source>J Clin Epidemiol</source><year>2024</year><month>07</month><volume>171</volume><fpage>111370</fpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2024.111370</pub-id><pub-id 
pub-id-type="medline">38670243</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Search strategy for the large language model (LLM) patient education systematic literature review.</p><media xlink:href="jmir_v28i1e87057_app1.docx" xlink:title="DOCX File, 3124 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>PRISMA flow diagram of unpublished systematic literature review of studies evaluating large language models (LLMs) for patient-facing health information published between July 2013 and March 2024.</p><media xlink:href="jmir_v28i1e87057_app2.png" xlink:title="PNG File, 140 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Large language model (LLM) patient education systematic literature review list of studies reviewed.</p><media xlink:href="jmir_v28i1e87057_app3.pdf" xlink:title="PDF File, 115 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Studies used in the pilot validation set (n=16 studies).</p><media xlink:href="jmir_v28i1e87057_app4.pdf" xlink:title="PDF File, 81 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Pilot validation dataset. Sixteen papers were each assessed by 2 evaluators for a total of 32 evaluations.</p><media xlink:href="jmir_v28i1e87057_app5.xlsx" xlink:title="XLSX File, 12 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Alberta Quality Assessment Tool: Risk of Bias (AQAT:RoB).</p><media xlink:href="jmir_v28i1e87057_app6.docx" xlink:title="DOCX File, 15 KB"/></supplementary-material></app-group></back></article>