<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="review-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v26i1e56532</article-id>
      <article-id pub-id-type="pmid">39499913</article-id>
      <article-id pub-id-type="doi">10.2196/56532</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Review</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Review</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>The Accuracy and Capability of Artificial Intelligence Solutions in Health Care Examinations and Certificates: Systematic Review and Meta-Analysis</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>de Azevedo Cardoso</surname>
            <given-names>Taiane</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Bortkiewicz</surname>
            <given-names>Alicja</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Domingues</surname>
            <given-names>Nuno</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Yao</surname>
            <given-names>Zonghai</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Waldock</surname>
            <given-names>William J</given-names>
          </name>
          <degrees>MBA, MBBCHIR</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3283-4096</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Joe</given-names>
          </name>
          <degrees>MBBS, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6040-2122</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Guni</surname>
            <given-names>Ahmad</given-names>
          </name>
          <degrees>MBBS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3265-8096</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Nabeel</surname>
            <given-names>Ahmad</given-names>
          </name>
          <degrees>MBBS</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4730-4397</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Darzi</surname>
            <given-names>Ara</given-names>
          </name>
          <degrees>MBBS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7815-7989</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Ashrafian</surname>
            <given-names>Hutan</given-names>
          </name>
          <degrees>BSc, MBBS, MBA, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Institute of Global Health Innovation</institution>
            <institution>Imperial College London</institution>
            <addr-line>10th Floor, Queen Elizabeth Queen Mother Building, Praed Street</addr-line>
            <addr-line>London</addr-line>
            <country>United Kingdom</country>
            <phone>44 07799871597</phone>
            <email>h.ashrafian@imperial.ac.uk</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1668-0672</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Imperial College London</institution>
        <addr-line>London</addr-line>
        <country>United Kingdom</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Institute of Global Health Innovation</institution>
        <institution>Imperial College London</institution>
        <addr-line>London</addr-line>
        <country>United Kingdom</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Hutan Ashrafian <email>h.ashrafian@imperial.ac.uk</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>5</day>
        <month>11</month>
        <year>2024</year>
      </pub-date>
      <volume>26</volume>
      <elocation-id>e56532</elocation-id>
      <history>
        <date date-type="received">
          <day>18</day>
          <month>1</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>25</day>
          <month>6</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>26</day>
          <month>6</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>25</day>
          <month>9</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©William J Waldock, Joe Zhang, Ahmad Guni, Ahmad Nabeel, Ara Darzi, Hutan Ashrafian. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 05.11.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2024/1/e56532" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Large language models (LLMs) have dominated public interest due to their apparent capability to accurately replicate learned knowledge in narrative text. However, there is a lack of clarity about the accuracy and capability standards of LLMs in health care examinations.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We conducted a systematic review of LLM accuracy, as tested under health care examination conditions, as compared to known human performance standards.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We quantified the accuracy of LLMs in responding to health care examination questions and evaluated the consistency and quality of study reporting. The search included all papers up until September 10, 2023, with all LLMs published in English journals that report clear LLM accuracy standards. The exclusion criteria were as follows: the assessment was not a health care exam, there was no LLM, there was no evaluation of comparable success accuracy, and the literature was not original research. The literature search included the following Medical Subject Headings (MeSH) terms used in all possible combinations: “artificial intelligence,” “ChatGPT,” “GPT,” “LLM,” “large language model,” “machine learning,” “neural network,” “Generative Pre-trained Transformer,” “Generative Transformer,” “Generative Language Model,” “Generative Model,” “medical exam,” “healthcare exam,” and “clinical exam.” Sensitivity, accuracy, and precision data were extracted, including relevant CIs.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The search identified 1673 relevant citations. After removing duplicate results, 1268 (75.8%) papers were screened for titles and abstracts, and 32 (2.5%) studies were included for full-text review. Our meta-analysis suggested that LLMs are able to perform with an overall medical examination accuracy of 0.61 (CI 0.58-0.64) and a United States Medical Licensing Examination (USMLE) accuracy of 0.51 (CI 0.46-0.56), while Chat Generative Pretrained Transformer (ChatGPT) can perform with an overall medical examination accuracy of 0.64 (CI 0.60-0.67).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>LLMs offer promise to remediate health care demand and staffing challenges by providing accurate and efficient context-specific information to critical decision makers. For policy and deployment decisions about LLMs to advance health care, we proposed a new framework called RUBRICC (Regulatory, Usability, Bias, Reliability [Evidence and Safety], Interoperability, Cost, and Codesign–Patient and Public Involvement and Engagement [PPIE]). This presents a valuable opportunity to direct the clinical commissioning of new LLM capabilities into health services, while respecting patient safety considerations.</p>
        </sec>
        <sec sec-type="trial registration">
          <title>Trial Registration</title>
          <p>OSF Registries osf.io/xqzkw; https://osf.io/xqzkw</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>large language model</kwd>
        <kwd>LLM</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>AI</kwd>
        <kwd>health care exam</kwd>
        <kwd>narrative medical response</kwd>
        <kwd>health care examination</kwd>
        <kwd>clinical commissioning</kwd>
        <kwd>health services</kwd>
        <kwd>safety</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>The advent of large language models (LLMs), such as Chat Generative Pretrained Transformer (ChatGPT; OpenAI), has generated extraordinary interest worldwide and transformed the landscape of artificial intelligence (AI). This foremost positioning of transformer models in the public and academic consciousness has been achieved by the remarkable ability of generative artificial intelligence (genAI) models to create new content with human-like semantics and syntax, alongside the capability to accurately replicate learned knowledge in narrative text. Numerous applications in medical research [<xref ref-type="bibr" rid="ref1">1</xref>], medical education [<xref ref-type="bibr" rid="ref1">1</xref>], clinical communication or consultation [<xref ref-type="bibr" rid="ref2">2</xref>], and even diagnosis and risk prediction tasks [<xref ref-type="bibr" rid="ref2">2</xref>] have been demonstrated to date. There is great positive potential for genAI across all of these pathways and great promise to relieve the increasing pressures and shortage of clinical expertise in health care systems worldwide [<xref ref-type="bibr" rid="ref2">2</xref>].</p>
      <p>The ability of genAI to answer medical examination questions is of particular interest. First, such examinations serve as the gateway for professional qualification. Written examination questions replicate complex clinical scenarios in narrative form and may include the possibility of multiple reasonable differential diagnoses (multiple choice) or require ranking of medically appropriate responses (single-best answer) according to not just clinical knowledge but also contextual decision-making and medical ethics. For decades, this type of examination has been the ultimate test of human clinical judgment and depth of knowledge. The performance of LLMs in this context has far-reaching implications for how medical education is delivered. Second, these expert-developed and expert-validated question-answer pairs are a coherent substitute for real-world training data written in narrative form and may serve to tune genAI models with a clinical consultation, communication, or diagnostic function. This is exemplified by Google’s use of medical examination questions to train and test Medical Patient Language Model 2 (Med-PaLM 2) [<xref ref-type="bibr" rid="ref3">3</xref>]. Finally, these same validated questions are a ready-made benchmark for assessing LLM capabilities in future clinical or medical education–related tasks.</p>
      <p>However, the use of LLMs is not without risk. They have a propensity to “hallucinate” false information and produce potentially dangerous inaccuracies [<xref ref-type="bibr" rid="ref4">4</xref>]. In addition, LLMs are created through a process of pretraining on vast existing text corpora to enable a general understanding of syntax and semantics. Although models may undergo fine-tuning for particular tasks or domains, this process does not modify the underlying “learned” knowledge but adjusts weights to adapt the model’s outputs for a required context. As such, the underlying embedding of our current societal state means that models will also encode societal biases, which will certainly include biases seen in health care provision and outcomes [<xref ref-type="bibr" rid="ref5">5</xref>]. An understanding of how these problems manifest in real-world tasks is key to developing mitigations and to establishing the risks and benefits of the use of LLMs in different medical areas.</p>
      <p>We conducted a systematic review of LLM accuracy, as tested under health care examination conditions, as compared to known human performance standards. We assessed the reporting quality and risk of bias within existing studies and synthesized a discussion of pitfalls and performance concerns, as reported by study investigators. We discussed how the observed LLM performance impacts medical education and genAI-enabled clinical consultation and recommended a framework for the conduct of future research in this area. In response to this rapidly progressing field, we aimed to establish a baseline performance and quality standard for the current generation of LLMs in narrative medical response tasks.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Design</title>
        <p>The systematic review was conducted according to a registered protocol and was reported according to the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) statement [<xref ref-type="bibr" rid="ref6">6</xref>]. The protocol was registered with the Open Science Framework (OSF) [<xref ref-type="bibr" rid="ref7">7</xref>], under the title “How Accurate are Artificial Intelligence LLMs When Applied to Healthcare Exams and Certificates?”, with the secondary research questions “What is the performance of LLM in comparison to required examination standards for humans?” and “What are the primary discovered weaknesses of LLM in addressing narrative health care examination scenarios that may be pertinent to real-world performance in clinical scenarios?”</p>
      </sec>
      <sec>
        <title>Eligibility Criteria</title>
        <p>The inclusion criteria were all papers up until September 10, 2023, published in English language journals that describe the use of AI solutions in health care examinations and certificates. As reflected in the Medical Subject Headings (MeSH) terms used, the authors screened the manuscripts for “artificial intelligence,” which could be described in the following possible ways: “ChatGPT,” “GPT,” “LLM,” “large language model,” “machine learning,” “neural network,” “Generative Pretrained Transformer,” “Generative Transformer,” “Generative Language Model,” or “Generative Model.” The exclusion criteria were as follows: the assessment was not a health care examination, there was no LLM, there was no evaluation of comparable success accuracy, and the literature was not original research (ie, commentary, editorials, reviews). We assessed LLMs, first, as applied to health care examinations and, by extension, as applied to clinical problems, including those encountered by individual patients and clinicians, and the likely impact on future medical education. We assessed the outcome of the accuracy of examination response performance and an intervention of the use of LLMs to answer narrative health care examination questions. The additional variable(s)/covariate(s) to consider were the name and country of medical examination; the “pass mark” and other score boundaries for each medical examination; the average and intervals of human performance for each medical examination that included benchmarks; the identity of LLMs; LLM characteristics, including parameter size; and any fine-tuning for the LLMs prior to testing.</p>
      </sec>
      <sec>
        <title>Information Sources</title>
        <p>The search included all papers up until September 10, 2023, at which point a preliminary search was conducted and piloting of the study selection process was commenced using MEDLINE/PubMed, CINAHL, ClinicalTrials.gov, Embase, and Google Scholar.</p>
      </sec>
      <sec>
        <title>Search Strategy</title>
        <p>The literature search included the following MeSH terms used in all possible combinations: “artificial intelligence,” “ChatGPT,” “GPT,” “LLM,” “large language model,” “machine learning,” “neural network,” “Generative Pre-trained Transformer,” “Generative Transformer,” “Generative Language Model,” “Generative Model,” “medical exam,” “healthcare exam,” and “clinical exam.” Two authors (WJW and AG) independently identified relevant studies, and any discrepancies were resolved by consensus with the help of a third author (HA).</p>
      </sec>
      <sec>
        <title>Selection Process</title>
        <p>Screening reliability and duplicate removal were maintained by 2 independent screeners reviewing abstracts (WJW and AG), with divergent screener decisions reconciled by a third master screener (HA). Abstracts were downloaded and screened in Covidence software [<xref ref-type="bibr" rid="ref8">8</xref>] using .ris and .csv files. Two independent authors (WJW and AG) performed full-text manuscript screening following abstract screening, with discrepancies resolved by consultation with the lead author (HA).</p>
      </sec>
      <sec>
        <title>Data Collection, Data Items, and Data Synthesis</title>
        <p>Two reviewers (WJW and AG) extracted and synthesized comparative accuracy data from the reviews on Covidence. No automation tools were used. The 2 authors independently extracted data from relevant studies, and any discrepancies were resolved by consensus with the help of a third author (HA). Sensitivity, accuracy, and precision data were extracted, including relevant CIs. The meta-analysis pooling of aggregate data used the random-effects inverse-variance model with DerSimonian-Laird estimate of tau<sup>2</sup>. The software used to conduct the meta-analysis was Stata Statistical Software Release 15 (StataCorp).</p>
      </sec>
      <sec>
        <title>Risk-of-Bias Assessment and Reporting Bias Assessment</title>
        <p>The QUADAS-2 tool [<xref ref-type="bibr" rid="ref9">9</xref>] was used for the systematic evaluation and assessment of the risk of bias and concerns regarding answer accuracy for clinical examination questions. The evaluation enabled adjudication of the applicability and bias concerns regarding reference standards and training data selection. Two independent authors performed the risk-of-bias assessment, with discrepancies resolved by consultation with the lead author (HA).</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Study Screening</title>
        <p>Based on PRISMA guidelines, the search identified 1673 relevant citations. After removing duplicate results, 1268 (75.8%) papers were screened for titles and abstracts, and 32 (2.5%) [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref40">40</xref>] studies were included for full-text review (see <xref rid="figure1" ref-type="fig">Figure 1</xref> and Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
        <p>The LLMs represented in this systematic literature review were Flan-PaLM 2 [<xref ref-type="bibr" rid="ref3">3</xref>], Generative Pretrained Transformer (GPT)-Neo [<xref ref-type="bibr" rid="ref10">10</xref>], ChatGPT [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref35">35</xref>], Google Bard [<xref ref-type="bibr" rid="ref13">13</xref>], Bing Chat [<xref ref-type="bibr" rid="ref13">13</xref>], PubMedGPT (Stanford University) [<xref ref-type="bibr" rid="ref36">36</xref>], BioLinkBERT [<xref ref-type="bibr" rid="ref37">37</xref>] (BERT stands for Bidirectional Encoder Representations from Transformers), PubMedBERT [<xref ref-type="bibr" rid="ref38">38</xref>], Galactica [<xref ref-type="bibr" rid="ref39">39</xref>], and DRAGON (Deep Bidirectional Language-Knowledge Graph Pretraining) [<xref ref-type="bibr" rid="ref40">40</xref>]. All these models are commercial, except BioLinkBERT, GPT-Neo, and DRAGON. The majority of LLMs used in medical examination tasks were pretrained, closed source models, developed and released by commercial organizations, such as ChatGPT. There was no prompt engineering described by the majority of the studies [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref35">35</xref>] when using ChatGPT, but Kung et al [<xref ref-type="bibr" rid="ref12">12</xref>] and Gilson et al [<xref ref-type="bibr" rid="ref14">14</xref>] specifically introduced prompt engineering to mitigate concerns about model “hallucinations” [<xref ref-type="bibr" rid="ref41">41</xref>]. Stanford University’s PubMedGPT 2.7B [<xref ref-type="bibr" rid="ref36">36</xref>] is an LLM trained on PubMed abstracts and Pile. 
Flan-PaLM 2 [<xref ref-type="bibr" rid="ref3">3</xref>], PubMedGPT [<xref ref-type="bibr" rid="ref36">36</xref>], DRAGON [<xref ref-type="bibr" rid="ref40">40</xref>], BioLinkBERT [<xref ref-type="bibr" rid="ref37">37</xref>], Galactica [<xref ref-type="bibr" rid="ref39">39</xref>], PubMedBERT [<xref ref-type="bibr" rid="ref38">38</xref>], and GPT-Neo [<xref ref-type="bibr" rid="ref10">10</xref>] were all evaluated using the same 12,723 United States Medical Licensing Examination (USMLE) open source question dataset [<xref ref-type="bibr" rid="ref42">42</xref>]. BioLinkBERT [<xref ref-type="bibr" rid="ref37">37</xref>] is a self-supervised pretraining bidirectional system that leverages graph structures in PubMed. PubMedBERT [<xref ref-type="bibr" rid="ref38">38</xref>] is a BERT-style model trained on PubMed, while Galactica [<xref ref-type="bibr" rid="ref39">39</xref>] is a GPT-style model trained on scientific literature that is 44 times the size of PubMedGPT 2.7B [<xref ref-type="bibr" rid="ref36">36</xref>].</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Study selection based on PRISMA guidelines. PRISMA: Preferred Reporting Items for Systematic Reviews and Meta-Analyses.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e56532_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Precision, Sensitivity, and Accuracy</title>
        <sec>
          <title>Precision</title>
          <p>When assessing the precision of LLMs in all examinations, 2 (6.3%) studies had an overall precision of 0.61 (CI 0.55-0.67) across 189 questions, with a tau<sup>2</sup> heterogeneity of 0.0018 and an <italic>I</italic><sup>2</sup> variation attributable to a heterogeneity of 99.6%.</p>
        </sec>
        <sec>
          <title>Sensitivity</title>
          <p>When assessing the sensitivity of LLMs in all examinations, 2 (6.3%) studies had an overall sensitivity of 1.00 (CI 1.00-1.00) across 189 questions, with a tau<sup>2</sup> heterogeneity of 0.0000 and an <italic>I</italic><sup>2</sup> variation attributable to a heterogeneity of 0%.</p>
        </sec>
        <sec>
          <title>Accuracy</title>
          <p>The overall LLM examination performance, USMLE accuracy, and ChatGPT accuracy were all evaluated by substudy meta-analysis, with question counts moderated for double-counting across multiple substudies. When assessing the accuracy of LLMs in all examinations, 47 substudies had an overall accuracy of 0.61 (CI 0.58-0.64) across 22,347 questions, with a tau<sup>2</sup> heterogeneity of 0.0088 and an <italic>I</italic><sup>2</sup> variation attributable to a heterogeneity of 100% (<xref ref-type="table" rid="table1">Table 1</xref> and <xref rid="figure2" ref-type="fig">Figure 2</xref>).</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>LLM<sup>a</sup> meta-analysis substudies.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="620"/>
              <col width="190"/>
              <col width="160"/>
              <thead>
                <tr valign="top">
                  <td colspan="2">Study and substudies</td>
                  <td>Questions, n</td>
                  <td>Accuracy</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="2">Alessandri Bonetti et al [<xref ref-type="bibr" rid="ref26">26</xref>]; IRANE (Italian Residency Admission National Exam)</td>
                  <td>140</td>
                  <td>0.87</td>
                </tr>
                <tr valign="top">
                  <td colspan="4">
                    <bold>Angel et al [<xref ref-type="bibr" rid="ref20">20</xref>]</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bard American Board of Anesthesiology (ABA)</td>
                  <td>1000</td>
                  <td>0.46</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>GPT<sup>b</sup>-3 ABA</td>
                  <td>1000</td>
                  <td>0.50</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>GPT-4 ABA</td>
                  <td>1000</td>
                  <td>0.80</td>
                </tr>
                <tr valign="top">
                  <td colspan="4">
                    <bold>Beaulieu-Jones et al [<xref ref-type="bibr" rid="ref30">30</xref>]</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Data-B</td>
                  <td>112</td>
                  <td>0.68</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>SCORE (Surgical Council on Resident Education)</td>
                  <td>167</td>
                  <td>0.71</td>
                </tr>
                <tr valign="top">
                  <td colspan="4">
                    <bold>Bolton et al [<xref ref-type="bibr" rid="ref36">36</xref>]</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>PubMedGPT</td>
                  <td>12,723</td>
                  <td>0.50</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>ChatGPT<sup>c</sup></td>
                  <td>1217</td>
                  <td>0.76</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Flores-Cohaila et al [<xref ref-type="bibr" rid="ref29">29</xref>]; Peruvian National Licensing Medical Examination (PNLME)</td>
                  <td>180</td>
                  <td>0.86</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Gencer et al [<xref ref-type="bibr" rid="ref23">23</xref>]; Turkish ChatGPT</td>
                  <td>105</td>
                  <td>0.91</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Giannos et al [<xref ref-type="bibr" rid="ref18">18</xref>]; BioMedical Admissions Test (BMAT)</td>
                  <td>509</td>
                  <td>0.73</td>
                </tr>
                <tr valign="top">
                  <td colspan="4">
                    <bold>Gilson et al [<xref ref-type="bibr" rid="ref14">14</xref>]</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>ChatGPT A</td>
                  <td>100</td>
                  <td>0.44</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>ChatGPT B</td>
                  <td>100</td>
                  <td>0.42</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>ChatGPT C</td>
                  <td>87</td>
                  <td>0.64</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>ChatGPT D</td>
                  <td>102</td>
                  <td>0.58</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Gu et al [<xref ref-type="bibr" rid="ref38">38</xref>]; PubMedBERT<sup>d</sup></td>
                  <td>12,723</td>
                  <td>0.38</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Guerra et al [<xref ref-type="bibr" rid="ref24">24</xref>]; ChatGPT Self-Assessment Neurosurgery (SANS)</td>
                  <td>643</td>
                  <td>0.77</td>
                </tr>
                <tr valign="top">
                  <td colspan="4">
                    <bold>Huang et al [<xref ref-type="bibr" rid="ref21">21</xref>]</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>GPT-3 Radiation Oncology in-Training (TXIT)</td>
                  <td>300</td>
                  <td>0.62</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>GPT-4 TXIT</td>
                  <td>300</td>
                  <td>0.79</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Huang et al [<xref ref-type="bibr" rid="ref28">28</xref>]; University of Toronto Family Medicine Residency Progress Test (UTFMRPT)</td>
                  <td>108</td>
                  <td>0.82</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Humar et al [<xref ref-type="bibr" rid="ref17">17</xref>]; ChatGPT plastic surgery</td>
                  <td>1129</td>
                  <td>0.56</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Huynh et al [<xref ref-type="bibr" rid="ref32">32</xref>]; GPT urology</td>
                  <td>135</td>
                  <td>0.28</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Kufel et al [<xref ref-type="bibr" rid="ref31">31</xref>]; ChatGPT Polish radiology examination</td>
                  <td>120</td>
                  <td>0.52</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Kung et al [<xref ref-type="bibr" rid="ref12">12</xref>]; ChatGPT</td>
                  <td>376</td>
                  <td>0.60</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Mannam et al [<xref ref-type="bibr" rid="ref35">35</xref>]; ChatGPT SANS</td>
                  <td>427</td>
                  <td>0.67</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Morreel et al [<xref ref-type="bibr" rid="ref16">16</xref>]; ChatGPT Dutch</td>
                  <td>47</td>
                  <td>0.50</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Oh et al [<xref ref-type="bibr" rid="ref19">19</xref>]; ChatGPT Korean</td>
                  <td>280</td>
                  <td>0.76</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Oztermeli et al [<xref ref-type="bibr" rid="ref22">22</xref>]; GPT-3.5 medical specialty examination (MSE)</td>
                  <td>1177</td>
                  <td>0.71</td>
                </tr>
                <tr valign="top">
                  <td colspan="4">
                    <bold>Raimondi et al [<xref ref-type="bibr" rid="ref13">13</xref>]</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bard Fellowship of the Royal College of Physicians and Surgeons (Ophthalmology), or FRCOphth, part 1</td>
                  <td>48</td>
                  <td>0.63</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bard FRCOphth part 2</td>
                  <td>43</td>
                  <td>0.52</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bing Chat FRCOphth part 1</td>
                  <td>48</td>
                  <td>0.79</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Bing Chat FRCOphth part 2</td>
                  <td>43</td>
                  <td>0.83</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>ChatGPT-3.5 FRCOphth part 1</td>
                  <td>48</td>
                  <td>0.55</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>ChatGPT-3.5 FRCOphth part 2</td>
                  <td>43</td>
                  <td>0.50</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LLM chatbot FRCOphth part 1</td>
                  <td>48</td>
                  <td>0.66</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>LLM chatbot FRCOphth part 2</td>
                  <td>43</td>
                  <td>0.68</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Sharma et al [<xref ref-type="bibr" rid="ref11">11</xref>]; ChatGPT</td>
                  <td>119</td>
                  <td>0.58</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Singhal et al [<xref ref-type="bibr" rid="ref3">3</xref>]; Med-PaLM 2<sup>e</sup></td>
                  <td>12,723</td>
                  <td>0.60</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Skalidis et al [<xref ref-type="bibr" rid="ref34">34</xref>]; ChatGPT cardiology</td>
                  <td>340</td>
                  <td>0.59</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Strong et al [<xref ref-type="bibr" rid="ref15">15</xref>]; ChatGPT</td>
                  <td>28</td>
                  <td>0.69</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Taylor et al [<xref ref-type="bibr" rid="ref39">39</xref>]; Galactica</td>
                  <td>12,723</td>
                  <td>0.44</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Venigalla et al [<xref ref-type="bibr" rid="ref10">10</xref>]; GPT-Neo</td>
                  <td>12,723</td>
                  <td>0.33</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Wang et al [<xref ref-type="bibr" rid="ref25">25</xref>]; Chinese National Medical Licensing Examination (CNMLE)</td>
                  <td>360</td>
                  <td>0.47</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Weng et al [<xref ref-type="bibr" rid="ref27">27</xref>]; Taiwan Family Medicine Board Exam (TFMBE)</td>
                  <td>125</td>
                  <td>0.42</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Yasunaga et al [<xref ref-type="bibr" rid="ref37">37</xref>]; BioLinkBERT</td>
                  <td>12,723</td>
                  <td>0.45</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">Yasunaga et al [<xref ref-type="bibr" rid="ref40">40</xref>]; DRAGON<sup>f</sup></td>
                  <td>12,723</td>
                  <td>0.48</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table1fn1">
                <p><sup>a</sup>LLM: large language model.</p>
              </fn>
              <fn id="table1fn2">
                <p><sup>b</sup>GPT: Generative Pretrained Transformer.</p>
              </fn>
              <fn id="table1fn3">
                <p><sup>c</sup>ChatGPT: Chat Generative Pretrained Transformer.</p>
              </fn>
              <fn id="table1fn4">
                <p><sup>d</sup>BERT: Bidirectional Encoder Representations from Transformers.</p>
              </fn>
              <fn id="table1fn5">
                <p><sup>e</sup>Med-PaLM 2: Medical Pathways Language Model 2.</p>
              </fn>
              <fn id="table1fn6">
                <p><sup>f</sup>DRAGON: Deep Bidirectional Language-Knowledge Graph Pretraining.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Forest plot of the accuracy of LLM performance on all medical examinations. ABA: American Board of Anesthesiology; BERT: Bidirectional Encoder Representations from Transformers; ChatGPT: Chat Generative Pretrained Transformer; CNMLE: Chinese National Medical Licensing Examination; DRAGON: Deep Bidirectional Language-Knowledge Graph Pretraining; FRCOphth: Fellowship of the Royal College of Physicians and Surgeons (Ophthalmology); GPT: Generative Pretrained Transformer; IRANE: Italian Residency Admission National Exam; LLM: large language model; MSE: medical specialty examination; PaLM: Pathways Language Model; PNLME: Peruvian National Licensing Medical Examination; PRE: Polish radiology examination; SANS: Self-Assessment Neurosurgery; SCORE: Surgical Council on Resident Education; TFMBE: Taiwan Family Medicine Board Exam; TXIT: Radiation Oncology in-Training; UTFMRPT: University of Toronto Family Medicine Residency Progress Test.</p>
            </caption>
            <graphic xlink:href="jmir_v26i1e56532_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>USMLE Accuracy</title>
          <p>When assessing the accuracy of LLMs in the USMLE, 14 substudies had an overall accuracy of 0.51 (CI 0.46-0.56) across 13,535 questions, with a tau<sup>2</sup> heterogeneity of 0.0080 and an <italic>I</italic><sup>2</sup> variation attributable to heterogeneity of 100%.</p>
        </sec>
        <sec>
          <title>ChatGPT Accuracy</title>
          <p>When assessing the accuracy of ChatGPT on medical examinations, 32 substudies had an overall accuracy of 0.64 (CI 0.60-0.67) across 9824 questions, with a tau<sup>2</sup> heterogeneity of 0.0128 and an <italic>I</italic><sup>2</sup> variation attributable to heterogeneity of 100%.</p>
        </sec>
        <sec>
          <title>Bias and Narrative Reporting</title>
          <p>Among the 32 studies that underwent QUADAS-2 [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref43">43</xref>] risk-of-bias evaluation (<xref rid="figure3" ref-type="fig">Figure 3</xref>), only 11 (24.4%) were eligible for meta-analysis. Overall, 10 (31.3%) studies were found to have high bias, 15 (46.9%) studies were found to have some concerns of bias, and 7 (21.9%) studies were found to have low bias. In addition, 3 (9.4%) studies referred to concerns about “hallucinations,” but none described the effect or referred to softer themes, such as empathy. No studies evaluated bias systematically. None of the reviewed studies were systematic reviews, so a TRIPOD (Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis) adherence to reporting standards analysis [<xref ref-type="bibr" rid="ref44">44</xref>] was not conducted.</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Risk-of-bias. ABA: American Board of Anesthesiology; BERT: Bidirectional Encoder Representations from Transformers; ChatGPT: Chat Generative Pretrained Transformer; CNMLE: Chinese National Medical Licensing Examination; DRAGON: Deep Bidirectional Language-Knowledge Graph Pretraining; FRCOphth: Fellowship of the Royal College of Physicians and Surgeons (Ophthalmology); GPT: Generative Pretrained Transformer; IRANE: Italian Residency Admission National Exam; LLM: large language model; MSE: medical specialty examination; PaLM: Pathways Language Model; PNLME: Peruvian National Licensing Medical Examination; PRE: Polish radiology examination; SANS: Self-Assessment Neurosurgery; SCORE: Surgical Council on Resident Education; TFMBE: Taiwan Family Medicine Board Exam; TFMR: Toronto Family Medicine Residency; TXIT: Radiation Oncology in-Training.</p>
            </caption>
            <graphic xlink:href="jmir_v26i1e56532_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Our meta-analysis suggests that LLMs are able to perform with an overall medical examination accuracy of 0.61 (CI 0.58-0.64) and a USMLE accuracy of 0.51 (CI 0.46-0.56), while ChatGPT can perform with an overall medical examination accuracy of 0.64 (CI 0.60-0.67). We quantified the accuracy of LLMs in responding to health care examination questions and evaluated the consistency and quality of study reporting. The majority of LLMs used in medical examination tasks were pretrained, closed-source models, developed and released by commercial organizations, such as ChatGPT. However, we found that minimal research has explored bias, “hallucination,” and holistic evaluation of the LLMs themselves. Moreover, neither the risk of bias nor holistic evaluation frameworks exist for LLMs themselves.</p>
        <p>There are inherent challenges to integrating LLMs into the education and clinical decision support of human doctors. Use cases for LLMs include grading, detection, prediction, and content generation [<xref ref-type="bibr" rid="ref45">45</xref>], but the application of these capabilities to the sociocultural elements of medicine is complex. Doctors offer empathetic relationships and formulate clinical reasoning in a more transparent way than current LLMs, raising concerns that the introduction of LLMs will undermine doctor-patient rapport [<xref ref-type="bibr" rid="ref46">46</xref>] and trust in the ethical compliance of the health care system. LLMs can automate the generation of text content, which offers opportunities to enhance student answer marking and provide responsive learning assistant chat features [<xref ref-type="bibr" rid="ref45">45</xref>]. However, these features lack transparency, prompting distrust in decision-making [<xref ref-type="bibr" rid="ref47">47</xref>], and there is a lack of evidence generation around student engagement [<xref ref-type="bibr" rid="ref48">48</xref>]. Although these training and infrastructure hurdles must be overcome, there is immense potential for personalized learning experiences with augmented and virtual reality, alongside enhanced curriculum implementation [<xref ref-type="bibr" rid="ref49">49</xref>].</p>
        <p>Medical examinations are not the same as medical practice [<xref ref-type="bibr" rid="ref50">50</xref>]. The tests that are designed to confirm a human’s suitability to practice medicine independently may not be appropriate for an LLM; real-world practice involves greater pathophysiological complexity, diverse holistic care considerations, and important ethical accountability frameworks to ensure empathetic patient-centered health services. Here, we demonstrated LLM capabilities in question-and-answer tasks according to established international benchmarks. Single-best answer questions are designed to simulate clinical decision-making, but there is a lack of relevance of examination questions to real-world tasks [<xref ref-type="bibr" rid="ref5">5</xref>]. Current models are trained on an unregulated range of both narrow and broad data sets to perform tasks with translational evidence, which currently have unclear significance in clinical practice [<xref ref-type="bibr" rid="ref5">5</xref>]. LLMs are not yet ready to be a proxy for human education, as questions simplify and isolate scenarios in an imperfect representation of real situations encountered by clinicians. However, the success of LLMs may justify a reconfiguration or even a disruption of medical training. This might involve an initial move toward formative assessments in view of the limitations of summative assessments exposed by the success of LLMs in the USMLE [<xref ref-type="bibr" rid="ref3">3</xref>]; rather, when offered access to a hitherto untapped wealth of medical information, the role of the doctor may be to provide judicious medical decisions when presented with intelligent and superintelligent LLM-generated treatment strategies.</p>
        <p>Virtual and remote learning opportunities will be enhanced by LLMs [<xref ref-type="bibr" rid="ref49">49</xref>], but bias, cost, and “hallucination” are the major obstacles to their application in health care. The definition of the threshold for acceptable clinical deployment varies across clinical scenarios and disease states due to the variation in the acceptable tolerance of error. LLMs are developed with parameters that reflect the established sociocultural inequalities in our society, which can be perpetuated in LLMs without further intervention. Solutions such as LLM-focused data governance strategies within current and future guidelines and novel approaches, including the use of synthetic data, will likely be needed to ensure those underserved by current data collection pools are not discriminated against in the behavior of the LLMs [<xref ref-type="bibr" rid="ref51">51</xref>]. With estimates suggesting that US $5 million of graphics processing units (GPUs) [<xref ref-type="bibr" rid="ref52">52</xref>] are needed at minimum for 1 LLM, their impressive capabilities are unlikely to be ubiquitous across health systems, such as the UK National Health Service (NHS), and may exacerbate inequalities. Finally, there is an inherent danger of “hallucination” with LLMs, undermining the protection of patient data and accurate contributions to live clinical scenarios [<xref ref-type="bibr" rid="ref53">53</xref>].</p>
      </sec>
      <sec>
        <title>Study Limitations</title>
        <p>The studies failed to explore the main barriers to LLM implementation in clinical practice, including bias, “hallucinations,” usability, cost, and privacy. The extensive variation between studies in the terminology, methodology, outcome measures, and data interpretability could be explained by a lack of consensus on how to conduct and report LLM studies. We have concerns over the reliability of these studies and the small volume of eligible studies for comparison. The lack of consistency in accuracy reporting between studies obstructed evaluation of the relative strengths of each method. There is an inherent challenge in evaluating technology with substantial commercial potential due to producers’ understandable reluctance about publishing sensitive details that may enable reproducibility but undermine commercial advantage. Our review concentrated on health care examination LLM performance and so did not account for LLM capability in more generalist evaluations that may still have valuable insights for optimizing health care capabilities.</p>
      </sec>
      <sec>
        <title>Future Work</title>
        <p>For policy and deployment decisions of LLMs to advance health care, we propose a new framework called <italic>RUBRICC</italic> (Regulatory, Usability, Bias, Reliability [Evidence and Safety], Interoperability, Cost, Codesign–Patient and Public Involvement and Engagement [PPIE]). See <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
        <sec>
          <title>Regulatory</title>
          <p>LLMs have unique evaluation requirements. Medicines and Healthcare products Regulatory Agency (MHRA) device standards may categorize some clinical LLMs as type 2b devices [<xref ref-type="bibr" rid="ref54">54</xref>], although medical knowledge progression (eg, National Institute for Health and Care Excellence [NICE] guidelines) may require the recall of LLMs due to their capabilities being constrained by periodic updates. Moreover, specific LLM standards for clinical commissioning are yet to be defined. It is important to forecast probable applications of LLMs, such as medical chatbots, clinical documentation, obtaining insurance preauthorization [<xref ref-type="bibr" rid="ref55">55</xref>], and reviewing research papers [<xref ref-type="bibr" rid="ref56">56</xref>]. Therefore, the regulatory responsibilities to patient safety and privacy will demand scrutiny on the grounds of LLMs’ complexity, hardware, privacy, and real-time adaptation [<xref ref-type="bibr" rid="ref55">55</xref>]. Developing rigorous and robust regulatory standards will require the commitment and input of key stakeholders, including clinicians, engineers, researchers, ethicists, health policymakers, and patients. Importantly, standards must be regularly adapted and revised to meet the rapidly advancing and evolving nature of LLMs.</p>
        </sec>
        <sec>
          <title>Usability</title>
          <p>Early adopter contexts will also likely be those in which the LLM is a clinical decision support tool integrated into various clinical contexts ranging from triage and differential diagnoses to imaging and medication decisions. Different geographies may apply these technologies differently: the United States’ insurance-based federated health landscape will likely apply LLMs to local health systems, in contrast to national data connectivity, which offers en masse precision LLM use across specialties, systems, and care tiers, such as in Estonia or the United Kingdom’s NHS [<xref ref-type="bibr" rid="ref57">57</xref>]. Academia will also be impacted, with publication assistance accelerating the role of LLM-coauthored literature [<xref ref-type="bibr" rid="ref56">56</xref>].</p>
        </sec>
        <sec>
          <title>Bias</title>
          <p>The systematic review literature deals in terms of bias, which represents the content and function of an AI. The bias discussions in the included papers focused on the following variables: <italic>within-item anchoring bias</italic>, <italic>grounding bias</italic>, <italic>chain-of-thought bias</italic>, and <italic>demographic bias</italic>. By contrast, risk characterizes the contextual impact of an LLM in conversations that inform commissioning of generative medical AI and aligns with current regulatory frameworks for current and future AI tools [<xref ref-type="bibr" rid="ref56">56</xref>]. Singhal et al [<xref ref-type="bibr" rid="ref3">3</xref>] evaluated Med-PaLM 2 using the following LLM answer risk framework: <italic>more inaccurate/irrelevant information</italic>, <italic>omits more information</italic>, <italic>more evidence of demographic bias</italic>, <italic>greater extent of harm</italic>, <italic>greater likelihood of harm.</italic> A key consideration is the risk matrix of LLM errors. There are unique requirements for LLM reporting that do not easily map onto established criteria, such as the Standards for Reporting of Diagnostic Accuracy Study (STARD) 2015 checklist [<xref ref-type="bibr" rid="ref58">58</xref>], and can be incorporated into the upcoming STARD-AI [<xref ref-type="bibr" rid="ref59">59</xref>]. Associated challenges related to bias include “hallucination” and privacy, threatening the reliability of these LLM services.</p>
        </sec>
        <sec>
          <title>Reliability</title>
          <sec>
            <title>Evidence</title>
            <p>Differences in reference standards and thresholds for diagnostic accuracy make comparison of LLM studies difficult in this nascent field, undermining the pathway to integration into health systems. These problems can only be addressed by specific reporting standards for AI studies [<xref ref-type="bibr" rid="ref59">59</xref>,<xref ref-type="bibr" rid="ref60">60</xref>], with design accuracy to address issues of reproducibility, transparency, and efficacy [<xref ref-type="bibr" rid="ref61">61</xref>]. Further evidence is needed to develop reliable guidelines [<xref ref-type="bibr" rid="ref62">62</xref>]. We therefore await guidelines that accommodate LLM utility to enable higher-quality and more consistent reporting, which in turn will empower the MHRA and the Food and Drug Administration (FDA) to evaluate LLM risk. Specifically, the development of AI-specific risk-of-bias tools, such as QUADAS-AI, will aid in establishing the risk of bias for evidence synthesis of clinical LLM studies, allowing clinically relevant conclusions to be drawn more confidently [<xref ref-type="bibr" rid="ref43">43</xref>].</p>
          </sec>
          <sec>
            <title>Safety</title>
            <p>Multidisciplinary secure data environments (SDEs) [<xref ref-type="bibr" rid="ref63">63</xref>] must be established with cybersecurity standards to assuage recognized concerns about AI manipulation and displacement of human welfare priorities [<xref ref-type="bibr" rid="ref64">64</xref>]. There remain established concerns about the regulated integration of LLMs into established clinical workstreams in view of “hallucination” concerns, which will require a quality management system to ensure compliance with best practices to mitigate risk to patients.</p>
          </sec>
        </sec>
        <sec>
          <title>Interoperability</title>
          <p>Although data flows in the NHS have been mapped [<xref ref-type="bibr" rid="ref65">65</xref>], there is a growing demand for infrastructural transformation to reduce data inequalities and avoid the digital exclusion of unrepresented and underprivileged groups. A particular challenge includes multimodal data linkages and interoperability with integration of LLM tools in multiple different scenarios across the health service. One must be careful to consider how secondary or primary care data might be used differently to inform population health tools.</p>
        </sec>
        <sec>
          <title>Cost</title>
          <p>The economic considerations for LLMs can be organized into procurement, data processing, housing and cloud storage, management, and usability costs. Training costs have declined around 80% on models similar to ChatGPT-3 over the past 3 years [<xref ref-type="bibr" rid="ref62">62</xref>]. The input cost is the number of tokens passed as prompts to the application programming interface (API), and the output cost is dependent on the number of tokens returned [<xref ref-type="bibr" rid="ref63">63</xref>]. Therefore, for medical free-text record summarization, there is a large input cost dominated by the high quantity of tokens for each prompt. Self-hosted LLMs incur cloud service costs to run the models; it is notable that ChatGPT-4 (32K context length) is priced at US $60 input cost (per million tokens) and US $120 output cost (per million tokens) [<xref ref-type="bibr" rid="ref66">66</xref>]. Further costs to consider include fine-tuning, which is most effective in improving performance on low-parameter models [<xref ref-type="bibr" rid="ref67">67</xref>]; the clinical commissioning decisions related to these costs will be linked to the quality-adjusted life years (QALYs) associated with incremental performance improvements.</p>
        </sec>
        <sec>
          <title>Codesign-PPIE</title>
          <p>Public trust in LLMs can be built through a codesign process, adhering to INVOLVE [<xref ref-type="bibr" rid="ref68">68</xref>] values, through respect, support, transparency, responsiveness, fairness of opportunity, and accountability. AI raises challenges for the codesign processes due to the disproportionate emphasis on procedures, patients lacking genuine understanding, and concerns AI may exacerbate inequalities; this is best resolved by a focus on sociotechnical values and design humility to acknowledge to patients what the proposed technology cannot achieve for them [<xref ref-type="bibr" rid="ref69">69</xref>]. Meanwhile, doctor-patient rapport will likely be enhanced due to LLMs alleviating administrative tasks and helping clinicians answer patient questions [<xref ref-type="bibr" rid="ref70">70</xref>]. RUBRICC is a nascent area of work that will undergo further development to enable utility and impact in the field.</p>
        </sec>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>LLMs offer promise to remediate health care demand and staffing challenges by providing accurate and efficient context-specific information to critical decision makers. However, progress is obstructed by inconsistent reporting and an imbalance of resources between commercial interests and public sector regulators to independently evaluate potential LLM services. The ability of LLMs to pass the USMLE does not mean that the models answer useful questions to practicing clinicians [<xref ref-type="bibr" rid="ref71">71</xref>]. Although initial results show impressive accuracy in isolated studies, there is an immediate need for a framework, such as RUBRICC, to evaluate this emergent technology and facilitate robust clinical commissioning decisions to benefit patients.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Summary of studies selected.</p>
        <media xlink:href="jmir_v26i1e56532_app1.docx" xlink:title="DOCX File , 33 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>RUBRICC (Regulatory, Usability, Bias, Reliability [Evidence and Safety], Interoperability, Cost, Codesign–Patient and Public Involvement and Engagement [PPIE]), a framework for LLM clinical policy decisions. AI: artificial intelligence; LLM: large language model.</p>
        <media xlink:href="jmir_v26i1e56532_app2.png" xlink:title="PNG File , 216 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) checklist.</p>
        <media xlink:href="jmir_v26i1e56532_app3.pdf" xlink:title="PDF File  (Adobe PDF File), 1786 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BERT</term>
          <def>
            <p>Bidirectional Encoder Representations from Transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">ChatGPT</term>
          <def>
            <p>Chat Generative Pretrained Transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">DRAGON</term>
          <def>
            <p>Deep Bidirectional Language-Knowledge Graph Pretraining</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">genAI</term>
          <def>
            <p>generative artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">GPT</term>
          <def>
            <p>Generative Pretrained Transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">Med-PaLM 2</term>
          <def>
            <p>Medical Pathways Language Model 2</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">MeSH</term>
          <def>
            <p>Medical Subject Headings</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">NHS</term>
          <def>
            <p>National Health Service</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">PRISMA</term>
          <def>
            <p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">RUBRICC</term>
          <def>
            <p>Regulatory, Usability, Bias, Reliability (Evidence and Safety), Interoperability, Cost, Codesign–Patient and Public Involvement and Engagement (PPIE)</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb13">STARD</term>
          <def>
            <p>Standards for Reporting of Diagnostic Accuracy Studies</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb14">USMLE</term>
          <def>
            <p>United States Medical Licensing Examination</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The authors declare that all the data included in this study are available within the paper and multimedia appendices. Further information on research design is available in the Nature Research Reporting Summary linked to this paper.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>WJW was responsible for conceptualization, data curation, formal analysis, investigation, methodology, project administration, resources, and writing—original draft; JZ, AN, and AD for conceptualization, supervision, and writing—review and editing; AG for methodology and data curation; and HA for conceptualization, methodology, supervision, and writing—review and editing. No generative AI was used in any portion of the manuscript writing.</p>
      </fn>
      <fn fn-type="conflict">
        <p>Covidence software was used with funding from the Imperial Healthcare National Health Service (NHS) Trust and Imperial College London. JZ is funded by the Wellcome Trust (grant number 203928/Z/16/Z). AD is chair for the Preemptive Medicine and Health Security Initiative at Flagship Pioneering. HA is chief scientific officer, Preemptive Health and Medicine, Flagship Pioneering.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arora</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Arora</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Generative adversarial networks and synthetic patient data: current challenges and future perspectives</article-title>
          <source>Future Healthc J</source>
          <year>2022</year>
          <month>07</month>
          <volume>9</volume>
          <issue>2</issue>
          <fpage>190</fpage>
          <lpage>193</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35928184"/>
          </comment>
          <pub-id pub-id-type="doi">10.7861/fhj.2022-0013</pub-id>
          <pub-id pub-id-type="medline">35928184</pub-id>
          <pub-id pub-id-type="pii">futurehealth</pub-id>
          <pub-id pub-id-type="pmcid">PMC9345230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sallam</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title>
          <source>Healthcare</source>
          <year>2023</year>
          <month>03</month>
          <day>19</day>
          <volume>11</volume>
          <issue>6</issue>
          <fpage>887</fpage>
          <pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singhal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gottweis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sayres</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wulczyn</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>A responsible path to generative AI in healthcare</article-title>
          <source>Google Cloud</source>
          <year>2023</year>
          <month>4</month>
          <day>14</day>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cloud.google.com/blog/topics/healthcare-life-sciences/sharing-google-med-palm-2-medical-large-language-model">https://cloud.google.com/blog/topics/healthcare-life-sciences/sharing-google-med-palm-2-medical-large-language-model</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Azamfirei</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kudchadkar</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Fackler</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Large language models and the perils of their hallucinations</article-title>
          <source>Crit Care</source>
          <year>2023</year>
          <month>03</month>
          <day>21</day>
          <volume>27</volume>
          <issue>1</issue>
          <fpage>120</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ccforum.biomedcentral.com/articles/10.1186/s13054-023-04393-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13054-023-04393-x</pub-id>
          <pub-id pub-id-type="medline">36945051</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13054-023-04393-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC10032023</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wornow</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Thapa</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Steinberg</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Fleming</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pfeffer</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Fries</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NH</given-names>
            </name>
          </person-group>
          <article-title>The shaky foundations of large language models and foundation models for electronic health records</article-title>
          <source>NPJ Digit Med</source>
          <year>2023</year>
          <month>07</month>
          <day>29</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>135</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-023-00879-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-023-00879-8</pub-id>
          <pub-id pub-id-type="medline">37516790</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-023-00879-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC10387101</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Moher</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liberati</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tetzlaff</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Altman</surname>
              <given-names>DG</given-names>
            </name>
          </person-group>
          <article-title>Preferred Reporting Items for Systematic Reviews and Meta-Analyses: the PRISMA statement</article-title>
          <source>PLoS Med</source>
          <year>2009</year>
          <month>7</month>
          <day>21</day>
          <volume>6</volume>
          <issue>7</issue>
          <fpage>e1000097</fpage>
          <pub-id pub-id-type="doi">10.1371/journal.pmed.1000097</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="web">
          <article-title>Systematic review and meta-analysis of the accuracy and capability of artificial intelligence solutions in healthcare exams and certificates</article-title>
          <source>Center for Open Science</source>
          <year>2023</year>
          <month>5</month>
          <day>26</day>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://osf.io/xqzkw">https://osf.io/xqzkw</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="web">
          <article-title>The world's #1 systematic review tool</article-title>
          <source>Covidence</source>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.covidence.org">https://www.covidence.org</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Whiting</surname>
              <given-names>PF</given-names>
            </name>
            <name name-style="western">
              <surname>Rutjes</surname>
              <given-names>AWS</given-names>
            </name>
            <name name-style="western">
              <surname>Westwood</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Mallett</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Deeks</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Reitsma</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Leeflang</surname>
              <given-names>MMG</given-names>
            </name>
            <name name-style="western">
              <surname>Sterne</surname>
              <given-names>JAC</given-names>
            </name>
            <name name-style="western">
              <surname>Bossuyt</surname>
              <given-names>PMM</given-names>
            </name>
            <collab>QUADAS-2 Group</collab>
          </person-group>
          <article-title>QUADAS-2: a revised tool for the quality assessment of diagnostic accuracy studies</article-title>
          <source>Ann Intern Med</source>
          <year>2011</year>
          <month>10</month>
          <day>18</day>
          <volume>155</volume>
          <issue>8</issue>
          <fpage>529</fpage>
          <lpage>536</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.acpjournals.org/doi/abs/10.7326/0003-4819-155-8-201110180-00009?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.7326/0003-4819-155-8-201110180-00009</pub-id>
          <pub-id pub-id-type="medline">22007046</pub-id>
          <pub-id pub-id-type="pii">155/8/529</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Venigalla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Frankle</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Carbin</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>BioMedLM: a domain-specific large language model for biomedical text</article-title>
          <source>Stanford Center for Research on Foundation Models (CRFM) and MosaicML</source>
          <year>2022</year>
          <month>12</month>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mosaicml.com/blog/introducing-pubmed-gpt">https://www.mosaicml.com/blog/introducing-pubmed-gpt</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Thapa</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Dhakal</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Upadhaya</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Adhikari</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Khanal</surname>
              <given-names>SR</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: unlocking the potential of large language models for AI-assisted medical education</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online 2023</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2307.00112</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <month>02</month>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Raimondi</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Tzoumas</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Salisbury</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Di Simplicio</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Romano</surname>
              <given-names>MR</given-names>
            </name>
            <collab>North East Trainee Research in Ophthalmology Network (NETRiON)</collab>
          </person-group>
          <article-title>Comparative analysis of large language models in the Royal College of Ophthalmologists fellowship exams</article-title>
          <source>Eye (Lond)</source>
          <year>2023</year>
          <month>12</month>
          <volume>37</volume>
          <issue>17</issue>
          <fpage>3530</fpage>
          <lpage>3533</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37161074"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41433-023-02563-3</pub-id>
          <pub-id pub-id-type="medline">37161074</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41433-023-02563-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC10686375</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gilson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Safranek</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Socrates</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Chartash</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT perform on the United States Medical Licensing Examination? The implications of large language models for medical education and knowledge assessment</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>02</month>
          <day>08</day>
          <volume>9</volume>
          <fpage>e45312</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e45312/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/45312</pub-id>
          <pub-id pub-id-type="medline">36753318</pub-id>
          <pub-id pub-id-type="pii">v9i1e45312</pub-id>
          <pub-id pub-id-type="pmcid">PMC9947764</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Strong</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>DiGiammarino</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Basaviah</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Hosamani</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nevins</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kugler</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hom</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>JH</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on free-response, clinical reasoning exams</article-title>
          <source>medRxiv</source>
          <comment>Preprint posted online 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1101/2023.03.24.23287731"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/2023.03.24.23287731</pub-id>
          <pub-id pub-id-type="medline">37034742</pub-id>
          <pub-id pub-id-type="pii">2023.03.24.23287731</pub-id>
          <pub-id pub-id-type="pmcid">PMC10081420</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Morreel</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mathysen</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Verhoeven</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Aye, AI! ChatGPT passes multiple-choice family medicine exam</article-title>
          <source>Med Teach</source>
          <year>2023</year>
          <month>06</month>
          <volume>45</volume>
          <issue>6</issue>
          <fpage>665</fpage>
          <lpage>666</lpage>
          <pub-id pub-id-type="doi">10.1080/0142159X.2023.2187684</pub-id>
          <pub-id pub-id-type="medline">36905610</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Humar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Asaad</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bengur</surname>
              <given-names>FB</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT is equivalent to first year plastic surgery residents: evaluation of ChatGPT on the Plastic Surgery In-Service Exam</article-title>
          <source>Aesthet Surg J</source>
          <year>2023</year>
          <month>05</month>
          <day>04</day>
          <fpage>sjad130</fpage>
          <pub-id pub-id-type="doi">10.1093/asj/sjad130</pub-id>
          <pub-id pub-id-type="medline">37140001</pub-id>
          <pub-id pub-id-type="pii">7151262</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Giannos</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Delardas</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on UK standardized admission tests: insights from the BMAT, TMUA, LNAT, and TSA examinations</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>04</month>
          <day>26</day>
          <volume>9</volume>
          <fpage>e47737</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e47737/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/47737</pub-id>
          <pub-id pub-id-type="medline">37099373</pub-id>
          <pub-id pub-id-type="pii">v9i1e47737</pub-id>
          <pub-id pub-id-type="pmcid">PMC10173042</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Oh</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>WY</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT goes to the operating room: evaluating GPT-4 performance and its potential in surgical education and training in the era of large language models</article-title>
          <source>Ann Surg Treat Res</source>
          <year>2023</year>
          <month>05</month>
          <volume>104</volume>
          <issue>5</issue>
          <fpage>269</fpage>
          <lpage>273</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37179699"/>
          </comment>
          <pub-id pub-id-type="doi">10.4174/astr.2023.104.5.269</pub-id>
          <pub-id pub-id-type="medline">37179699</pub-id>
          <pub-id pub-id-type="pmcid">PMC10172028</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Angel</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Rinehart</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Cannesson</surname>
              <given-names>MP</given-names>
            </name>
            <name name-style="western">
              <surname>Baldi</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Clinical knowledge and reasoning abilities of AI large language models in anesthesiology: a comparative study on the ABA exam</article-title>
          <source>medRxiv</source>
          <comment>Preprint posted online 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1101/2023.05.10.23289805"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/2023.05.10.23289805</pub-id>
          <pub-id pub-id-type="medline">37292642</pub-id>
          <pub-id pub-id-type="pii">2023.05.10.23289805</pub-id>
          <pub-id pub-id-type="pmcid">PMC10246030</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Gomaa</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Semrau</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Haderlein</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lettmaier</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Weissmann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Grigo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tkhayat</surname>
              <given-names>HB</given-names>
            </name>
            <name name-style="western">
              <surname>Frey</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Gaipl</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Distel</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Maier</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fietkau</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bert</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Putz</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Benchmarking ChatGPT-4 on a radiation oncology in-training exam and Red Journal Gray Zone cases: potentials and challenges for AI-assisted medical education and decision making in radiation oncology</article-title>
          <source>Front Oncol</source>
          <year>2023</year>
          <volume>13</volume>
          <fpage>1265024</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37790756"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fonc.2023.1265024</pub-id>
          <pub-id pub-id-type="medline">37790756</pub-id>
          <pub-id pub-id-type="pmcid">PMC10543650</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Oztermeli</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>Oztermeli</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT performance in the medical specialty exam: an observational study</article-title>
          <source>Medicine (Baltimore)</source>
          <year>2023</year>
          <month>08</month>
          <day>11</day>
          <volume>102</volume>
          <issue>32</issue>
          <fpage>e34673</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37565917"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/MD.0000000000034673</pub-id>
          <pub-id pub-id-type="medline">37565917</pub-id>
          <pub-id pub-id-type="pii">00005792-202308110-00076</pub-id>
          <pub-id pub-id-type="pmcid">PMC10419419</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gencer</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Aydin</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Can ChatGPT pass the thoracic surgery exam?</article-title>
          <source>Am J Med Sci</source>
          <year>2023</year>
          <month>10</month>
          <volume>366</volume>
          <issue>4</issue>
          <fpage>291</fpage>
          <lpage>295</lpage>
          <pub-id pub-id-type="doi">10.1016/j.amjms.2023.08.001</pub-id>
          <pub-id pub-id-type="medline">37549788</pub-id>
          <pub-id pub-id-type="pii">S0002-9629(23)01292-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guerra</surname>
              <given-names>GA</given-names>
            </name>
            <name name-style="western">
              <surname>Hofmann</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Sobhani</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hofmann</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Soroudi</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hopkins</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Dallas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pangal</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Cheok</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>VN</given-names>
            </name>
            <name name-style="western">
              <surname>Mack</surname>
              <given-names>WJ</given-names>
            </name>
            <name name-style="western">
              <surname>Zada</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 artificial intelligence model outperforms ChatGPT, medical students, and neurosurgery residents on neurosurgery written board-like questions</article-title>
          <source>World Neurosurg</source>
          <year>2023</year>
          <month>11</month>
          <volume>179</volume>
          <fpage>e160</fpage>
          <lpage>e165</lpage>
          <pub-id pub-id-type="doi">10.1016/j.wneu.2023.08.042</pub-id>
          <pub-id pub-id-type="medline">37597659</pub-id>
          <pub-id pub-id-type="pii">S1878-8750(23)01144-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Gong</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT performs on the Chinese National Medical Licensing Examination</article-title>
          <source>J Med Syst</source>
          <year>2023</year>
          <month>08</month>
          <day>15</day>
          <volume>47</volume>
          <issue>1</issue>
          <fpage>86</fpage>
          <pub-id pub-id-type="doi">10.1007/s10916-023-01961-0</pub-id>
          <pub-id pub-id-type="medline">37581690</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10916-023-01961-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alessandri Bonetti</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Giorgino</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gallo Afflitto</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>De Lorenzi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Egro</surname>
              <given-names>FM</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT perform on the Italian Residency Admission National Exam compared to 15,869 medical graduates?</article-title>
          <source>Ann Biomed Eng</source>
          <year>2024</year>
          <month>04</month>
          <day>25</day>
          <volume>52</volume>
          <issue>4</issue>
          <fpage>745</fpage>
          <lpage>749</lpage>
          <pub-id pub-id-type="doi">10.1007/s10439-023-03318-7</pub-id>
          <pub-id pub-id-type="medline">37490183</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10439-023-03318-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Hwang</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT failed Taiwan's Family Medicine Board Exam</article-title>
          <source>J Chin Med Assoc</source>
          <year>2023</year>
          <month>08</month>
          <day>01</day>
          <volume>86</volume>
          <issue>8</issue>
          <fpage>762</fpage>
          <lpage>766</lpage>
          <pub-id pub-id-type="doi">10.1097/JCMA.0000000000000946</pub-id>
          <pub-id pub-id-type="medline">37294147</pub-id>
          <pub-id pub-id-type="pii">02118582-990000000-00224</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>RS</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>KJQ</given-names>
            </name>
            <name name-style="western">
              <surname>Meaney</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kemppainen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Punnett</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Leung</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Assessment of resident and AI chatbot performance on the University of Toronto Family Medicine Residency Progress Test: comparative study</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>09</month>
          <day>19</day>
          <volume>9</volume>
          <fpage>e50514</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e50514/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/50514</pub-id>
          <pub-id pub-id-type="medline">37725411</pub-id>
          <pub-id pub-id-type="pii">v9i1e50514</pub-id>
          <pub-id pub-id-type="pmcid">PMC10548315</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Flores-Cohaila</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>García-Vicente</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Vizcarra-Jiménez</surname>
              <given-names>SF</given-names>
            </name>
            <name name-style="western">
              <surname>De la Cruz-Galán</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Gutiérrez-Arratia</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Quiroga Torres</surname>
              <given-names>BG</given-names>
            </name>
            <name name-style="western">
              <surname>Taype-Rondan</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on the Peruvian National Licensing Medical Examination: cross-sectional study</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>09</month>
          <day>28</day>
          <volume>9</volume>
          <fpage>e48039</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e48039/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48039</pub-id>
          <pub-id pub-id-type="medline">37768724</pub-id>
          <pub-id pub-id-type="pii">v9i1e48039</pub-id>
          <pub-id pub-id-type="pmcid">PMC10570896</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beaulieu-Jones</surname>
              <given-names>BR</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Berrigan</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Marwaha</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Brat</surname>
              <given-names>GA</given-names>
            </name>
          </person-group>
          <article-title>Evaluating capabilities of large language models: performance of GPT4 on surgical knowledge assessments</article-title>
          <source>medRxiv</source>
          <comment>Preprint posted online 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37502981"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/2023.07.16.23292743</pub-id>
          <pub-id pub-id-type="medline">37502981</pub-id>
          <pub-id pub-id-type="pii">2023.07.16.23292743</pub-id>
          <pub-id pub-id-type="pmcid">PMC10371188</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kufel</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Paszkiewicz</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Bielówka</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bartnikowska</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Janik</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Stencel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Czogalik</surname>
              <given-names>Ł</given-names>
            </name>
            <name name-style="western">
              <surname>Gruszczyńska</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mielcarska</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Will ChatGPT pass the Polish specialty exam in radiology and diagnostic imaging? Insights into strengths and limitations</article-title>
          <source>Pol J Radiol</source>
          <year>2023</year>
          <volume>88</volume>
          <fpage>e430</fpage>
          <lpage>e434</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.5114/pjr.2023.131215"/>
          </comment>
          <pub-id pub-id-type="doi">10.5114/pjr.2023.131215</pub-id>
          <pub-id pub-id-type="medline">37808173</pub-id>
          <pub-id pub-id-type="pii">51387</pub-id>
          <pub-id pub-id-type="pmcid">PMC10551734</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huynh</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Bonebrake</surname>
              <given-names>BT</given-names>
            </name>
            <name name-style="western">
              <surname>Schultis</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Quach</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Deibert</surname>
              <given-names>CM</given-names>
            </name>
          </person-group>
          <article-title>New artificial intelligence ChatGPT performs poorly on the 2022 Self-assessment Study Program for urology</article-title>
          <source>Urol Pract</source>
          <year>2023</year>
          <month>07</month>
          <volume>10</volume>
          <issue>4</issue>
          <fpage>409</fpage>
          <lpage>415</lpage>
          <pub-id pub-id-type="doi">10.1097/UPJ.0000000000000406</pub-id>
          <pub-id pub-id-type="medline">37276372</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Borchert</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hickman</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>Pepys</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sadler</surname>
              <given-names>TJ</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on the Situational Judgement Test – a professional dilemma-based examination for doctors in the United Kingdom</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <month>08</month>
          <day>07</day>
          <volume>9</volume>
          <fpage>e48978</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e48978/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48978</pub-id>
          <pub-id pub-id-type="medline">37548997</pub-id>
          <pub-id pub-id-type="pii">v9i1e48978</pub-id>
          <pub-id pub-id-type="pmcid">PMC10442724</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Skalidis</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Cagnina</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Luangphiphat</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Mahendiran</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Muller</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Abbe</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Fournier</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT takes on the European Exam in Core Cardiology: an artificial intelligence success story?</article-title>
          <source>Eur Heart J Digit Health</source>
          <year>2023</year>
          <month>04</month>
          <day>24</day>
          <volume>4</volume>
          <issue>3</issue>
          <fpage>279</fpage>
          <lpage>281</lpage>
          <comment>Erratum in: Eur Heart J Digit Health. 2023 May 17;4(4):357. doi: 10.1093/ehjdh/ztad034</comment>
          <pub-id pub-id-type="doi">10.1093/ehjdh/ztad029</pub-id>
          <pub-id pub-id-type="medline">37265864</pub-id>
          <pub-id pub-id-type="pii">ztad029</pub-id>
          <pub-id pub-id-type="pmcid">PMC10232281</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mannam</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Subtirelu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chauhan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmad</surname>
              <given-names>HS</given-names>
            </name>
            <name name-style="western">
              <surname>Matache</surname>
              <given-names>IM</given-names>
            </name>
            <name name-style="western">
              <surname>Bryan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Chitta</surname>
              <given-names>SV</given-names>
            </name>
            <name name-style="western">
              <surname>Bathula</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Turlip</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wathen</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ghenbot</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ajmera</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Blue</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>HI</given-names>
            </name>
            <name name-style="western">
              <surname>Ali</surname>
              <given-names>ZS</given-names>
            </name>
            <name name-style="western">
              <surname>Malhotra</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Srinivasan</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Ozturk</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>JW</given-names>
            </name>
          </person-group>
          <article-title>Large language model-based neurosurgical evaluation matrix: a novel scoring criteria to assess the efficacy of ChatGPT as an educational tool for neurosurgery board preparation</article-title>
          <source>World Neurosurg</source>
          <year>2023</year>
          <month>12</month>
          <volume>180</volume>
          <fpage>e765</fpage>
          <lpage>e773</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/j.wneu.2023.10.043"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.wneu.2023.10.043</pub-id>
          <pub-id pub-id-type="medline">37839567</pub-id>
          <pub-id pub-id-type="pii">S1878-8750(23)01448-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bolton</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yasunaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Stanford CRFM introduces PubMedGPT 2.7B</article-title>
          <source>Stanford</source>
          <year>2022</year>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://hai.stanford.edu/news/stanford-crfm-introduces-pubmedgpt-27b">https://hai.stanford.edu/news/stanford-crfm-introduces-pubmedgpt-27b</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yasunaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Leskovec</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>LinkBERT: pretraining language models with document links</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online 2022</comment>
          <pub-id pub-id-type="doi">10.18653/v1/2022.acl-long.551</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tinn</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lucas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Usuyama</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Poon</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Domain-specific language model pretraining for biomedical natural language processing</article-title>
          <source>ACM Trans Comput Healthc</source>
          <year>2021</year>
          <month>10</month>
          <day>15</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>23</lpage>
          <pub-id pub-id-type="doi">10.1145/3458754</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kardas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cucurull</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Scialom</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Hartshorn</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Saravia</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Poulton</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Galactica: a large language model for science</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online 2022</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2211.09085</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yasunaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bosselut</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ren</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>CD</given-names>
            </name>
            <name name-style="western">
              <surname>Liang</surname>
              <given-names>PS</given-names>
            </name>
            <name name-style="western">
              <surname>Leskovec</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Deep bidirectional language-knowledge graph pretraining</article-title>
          <year>2022</year>
          <conf-name>NeurIPS 2022: 36th Conference on Neural Information Processing Systems</conf-name>
          <conf-date>November 28-December 9, 2022</conf-date>
          <conf-loc>New Orleans, LA</conf-loc>
          <fpage>37309</fpage>
          <lpage>37323</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Moradi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Blagec</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Haberl</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Samwald</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>GPT-3 models are poor few-shot learners in the biomedical domain</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online 2021</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2109.02555</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Oufattole</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>What disease does this patient have? A large-scale open domain question answering dataset from medical exams</article-title>
          <source>Appl Sci</source>
          <year>2021</year>
          <month>07</month>
          <day>12</day>
          <volume>11</volume>
          <issue>14</issue>
          <fpage>6421</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.3390/app11146421"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/app11146421</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sounderajah</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Ashrafian</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Rose</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>NH</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Golub</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kahn</surname>
              <given-names>CE</given-names>
              <suffix>Jr</suffix>
            </name>
          </person-group>
          <article-title>A quality assessment tool for artificial intelligence-centered diagnostic test accuracy studies: QUADAS-AI</article-title>
          <source>Nat Med</source>
          <year>2021</year>
          <month>10</month>
          <volume>27</volume>
          <issue>10</issue>
          <fpage>1663</fpage>
          <lpage>1665</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41591-021-01517-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41591-021-01517-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="web">
          <article-title>Adherence to TRIPOD</article-title>
          <source>TRIPOD</source>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.tripod-statement.org/adherence/">https://www.tripod-statement.org/adherence/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sha</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Martinez-Maldonado</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Practical and ethical challenges of large language models in education: a systematic literature review</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online 2023</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2303.13379</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Civaner</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Uncu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Bulut</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Chalil</surname>
              <given-names>EG</given-names>
            </name>
            <name name-style="western">
              <surname>Tatli</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in medical education: a cross-sectional needs assessment</article-title>
          <source>BMC Med Educ</source>
          <year>2022</year>
          <month>11</month>
          <day>09</day>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>772</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-022-03852-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-022-03852-3</pub-id>
          <pub-id pub-id-type="medline">36352431</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12909-022-03852-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC9646274</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Masoumian Hosseini</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Masoumian Hosseini</surname>
              <given-names>ST</given-names>
            </name>
            <name name-style="western">
              <surname>Qayumi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmady</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Koohestani</surname>
              <given-names>HR</given-names>
            </name>
          </person-group>
          <article-title>The aspects of running artificial intelligence in emergency care; a scoping review</article-title>
          <source>Arch Acad Emerg Med</source>
          <year>2023</year>
          <volume>11</volume>
          <issue>1</issue>
          <fpage>e38</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37215232"/>
          </comment>
          <pub-id pub-id-type="doi">10.22037/aaem.v11i1.1974</pub-id>
          <pub-id pub-id-type="medline">37215232</pub-id>
          <pub-id pub-id-type="pmcid">PMC10197918</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grunhut</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Marques</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Wyatt</surname>
              <given-names>ATM</given-names>
            </name>
          </person-group>
          <article-title>Needs, challenges, and applications of artificial intelligence in medical education curriculum</article-title>
          <source>JMIR Med Educ</source>
          <year>2022</year>
          <month>06</month>
          <day>07</day>
          <volume>8</volume>
          <issue>2</issue>
          <fpage>e35587</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2022/2/e35587/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/35587</pub-id>
          <pub-id pub-id-type="medline">35671077</pub-id>
          <pub-id pub-id-type="pii">v8i2e35587</pub-id>
          <pub-id pub-id-type="pmcid">PMC9214616</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mir</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Mir</surname>
              <given-names>GM</given-names>
            </name>
            <name name-style="western">
              <surname>Raina</surname>
              <given-names>NT</given-names>
            </name>
            <name name-style="western">
              <surname>Mir</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Mir</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Miskeen</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Alharthi</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Alamri</surname>
              <given-names>MMS</given-names>
            </name>
          </person-group>
          <article-title>Application of artificial intelligence in medical education: current scenario and future perspectives</article-title>
          <source>J Adv Med Educ Prof</source>
          <year>2023</year>
          <month>07</month>
          <volume>11</volume>
          <issue>3</issue>
          <fpage>133</fpage>
          <lpage>140</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37469385"/>
          </comment>
          <pub-id pub-id-type="doi">10.30476/JAMP.2023.98655.1803</pub-id>
          <pub-id pub-id-type="medline">37469385</pub-id>
          <pub-id pub-id-type="pii">JAMP-11-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC10352669</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ahuja</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>The impact of artificial intelligence in medicine on the future role of the physician</article-title>
          <source>PeerJ</source>
          <year>2019</year>
          <month>10</month>
          <day>04</day>
          <volume>7</volume>
          <fpage>e7702</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31592346"/>
          </comment>
          <pub-id pub-id-type="doi">10.7717/peerj.7702</pub-id>
          <pub-id pub-id-type="medline">31592346</pub-id>
          <pub-id pub-id-type="pii">7702</pub-id>
          <pub-id pub-id-type="pmcid">PMC6779111</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="web">
          <article-title>Synthetic data's role in LLM evolution</article-title>
          <source>Syntheticus</source>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://tinyurl.com/ye2ud5sr">https://tinyurl.com/ye2ud5sr</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>CS</given-names>
            </name>
          </person-group>
          <article-title>What large models cost you – there is no free AI lunch</article-title>
          <source>Forbes</source>
          <year>2023</year>
          <month>9</month>
          <day>8</day>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.forbes.com/sites/craigsmith/2023/09/08/what-large-models-cost-you--there-is-no-free-ai-lunch/">https://www.forbes.com/sites/craigsmith/2023/09/08/what-large-models-cost-you--there-is-no-free-ai-lunch/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cath</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wachter</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mittelstadt</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Taddeo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Floridi</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence and the ‘good society’: the US, EU, and UK approach</article-title>
          <source>Sci Eng Ethics</source>
          <year>2018</year>
          <month>04</month>
          <day>28</day>
          <volume>24</volume>
          <issue>2</issue>
          <fpage>505</fpage>
          <lpage>528</lpage>
          <pub-id pub-id-type="doi">10.1007/s11948-017-9901-7</pub-id>
          <pub-id pub-id-type="medline">28353045</pub-id>
          <pub-id pub-id-type="pii">10.1007/s11948-017-9901-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="web">
          <article-title>Section 5 - classification of general medical devices</article-title>
          <source>Medicines &#38; Healthcare products Regulatory Agency</source>
          <year>2022</year>
          <month>6</month>
          <day>26</day>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://tinyurl.com/3tyra9t7">https://tinyurl.com/3tyra9t7</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meskó</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Topol</surname>
              <given-names>EJ</given-names>
            </name>
          </person-group>
          <article-title>The imperative for regulatory oversight of large language models (or generative AI) in healthcare</article-title>
          <source>NPJ Digit Med</source>
          <year>2023</year>
          <month>07</month>
          <day>06</day>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>120</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-023-00873-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-023-00873-0</pub-id>
          <pub-id pub-id-type="medline">37414860</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-023-00873-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC10326069</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Conroy</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>How ChatGPT and other AI tools could disrupt scientific publishing</article-title>
          <source>Nature</source>
          <year>2023</year>
          <month>10</month>
          <day>01</day>
          <volume>622</volume>
          <issue>7982</issue>
          <fpage>234</fpage>
          <lpage>236</lpage>
          <pub-id pub-id-type="doi">10.1038/d41586-023-03144-w</pub-id>
          <pub-id pub-id-type="medline">37817033</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-023-03144-w</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="web">
          <article-title>European approach to artificial intelligence</article-title>
          <source>European Commission</source>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://digital-strategy.ec.europa.eu/en/policies/european-approach-artificial-intelligence">https://digital-strategy.ec.europa.eu/en/policies/european-approach-artificial-intelligence</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bossuyt</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Reitsma</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Bruns</surname>
              <given-names>DE</given-names>
            </name>
            <name name-style="western">
              <surname>Gatsonis</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Glasziou</surname>
              <given-names>PP</given-names>
            </name>
            <name name-style="western">
              <surname>Irwig</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lijmer</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Moher</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Rennie</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>de Vet</surname>
              <given-names>HCW</given-names>
            </name>
            <name name-style="western">
              <surname>Kressel</surname>
              <given-names>HY</given-names>
            </name>
            <name name-style="western">
              <surname>Rifai</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Golub</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Altman</surname>
              <given-names>DG</given-names>
            </name>
            <name name-style="western">
              <surname>Hooft</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Korevaar</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>JF</given-names>
            </name>
            <collab>STARD Group</collab>
          </person-group>
          <article-title>STARD 2015: an updated list of essential items for reporting diagnostic accuracy studies</article-title>
          <source>BMJ</source>
          <year>2015</year>
          <month>10</month>
          <day>28</day>
          <volume>351</volume>
          <fpage>h5527</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.bmj.com/lookup/pmidlookup?view=long&#38;pmid=26511519"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.h5527</pub-id>
          <pub-id pub-id-type="medline">26511519</pub-id>
          <pub-id pub-id-type="pmcid">PMC4623764</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sounderajah</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Ashrafian</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Aggarwal</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>De Fauw</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Denniston</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Greaves</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Markar</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>McInnes</surname>
              <given-names>MDF</given-names>
            </name>
            <name name-style="western">
              <surname>Panch</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Pearson-Stuttard</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSW</given-names>
            </name>
            <name name-style="western">
              <surname>Golub</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Moher</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bossuyt</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Darzi</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Developing specific reporting guidelines for diagnostic accuracy studies assessing AI interventions: the STARD-AI Steering Group</article-title>
          <source>Nat Med</source>
          <year>2020</year>
          <month>06</month>
          <day>08</day>
          <volume>26</volume>
          <issue>6</issue>
          <fpage>807</fpage>
          <lpage>808</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-020-0941-1</pub-id>
          <pub-id pub-id-type="medline">32514173</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-020-0941-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Rivera</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Moher</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Calvert</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Denniston</surname>
              <given-names>AK</given-names>
            </name>
            <collab>SPIRIT-AI and CONSORT-AI Working Group</collab>
          </person-group>
          <article-title>Reporting guidelines for clinical trial reports for interventions involving artificial intelligence: the CONSORT-AI Extension</article-title>
          <source>BMJ</source>
          <year>2020</year>
          <month>09</month>
          <day>09</day>
          <volume>370</volume>
          <fpage>m3164</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.bmj.com/lookup/pmidlookup?view=long&#38;pmid=32909959"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.m3164</pub-id>
          <pub-id pub-id-type="medline">32909959</pub-id>
          <pub-id pub-id-type="pmcid">PMC7490784</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vollmer</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mateen</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Bohner</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Király</surname>
              <given-names>FJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ghani</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jonsson</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Cumbers</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jonas</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>McAllister</surname>
              <given-names>KSL</given-names>
            </name>
            <name name-style="western">
              <surname>Myles</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Granger</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Birse</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Branson</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Moons</surname>
              <given-names>KGM</given-names>
            </name>
            <name name-style="western">
              <surname>Collins</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Ioannidis</surname>
              <given-names>JPA</given-names>
            </name>
            <name name-style="western">
              <surname>Holmes</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hemingway</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Machine learning and artificial intelligence research for patient benefit: 20 critical questions on transparency, replicability, ethics, and effectiveness</article-title>
          <source>BMJ</source>
          <year>2020</year>
          <month>03</month>
          <day>20</day>
          <volume>368</volume>
          <fpage>l6927</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.bmj.com/lookup/pmidlookup?view=long&#38;pmid=32198138"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.l6927</pub-id>
          <pub-id pub-id-type="medline">32198138</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ngiam</surname>
              <given-names>KY</given-names>
            </name>
            <name name-style="western">
              <surname>Khor</surname>
              <given-names>IW</given-names>
            </name>
          </person-group>
          <article-title>Big data and machine learning algorithms for health-care delivery</article-title>
          <source>Lancet Oncol</source>
          <year>2019</year>
          <month>05</month>
          <volume>20</volume>
          <issue>5</issue>
          <fpage>e262</fpage>
          <lpage>e273</lpage>
          <pub-id pub-id-type="doi">10.1016/s1470-2045(19)30149-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="web">
          <article-title>Secure data environment for NHS health and social care data - policy guidelines</article-title>
          <source>Department of Health &#38; Social Care</source>
          <year>2022</year>
          <month>12</month>
          <day>23</day>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.gov.uk/government/publications/secure-data-environment-policy-guidelines/secure-data-environment-for-nhs-health-and-social-care-data-policy-guidelines">https://www.gov.uk/government/publications/secure-data-environment-policy-guidelines/secure-data-environment-for-nhs-health-and-social-care-data-policy-guidelines</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref64">
        <label>64</label>
        <nlm-citation citation-type="web">
          <article-title>AI concerns: manipulating humans, or even replacing them</article-title>
          <source>MIT Sloan</source>
          <year>2023</year>
          <month>05</month>
          <day>23</day>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://tinyurl.com/28jemrmy">https://tinyurl.com/28jemrmy</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref65">
        <label>65</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Morley</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gallifant</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Oddy</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Teo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ashrafian</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Delaney</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Darzi</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Mapping and evaluating national data flows: transparency, privacy, and guiding infrastructural transformation</article-title>
          <source>Lancet Digit Health</source>
          <year>2023</year>
          <month>10</month>
          <volume>5</volume>
          <issue>10</issue>
          <fpage>e737</fpage>
          <lpage>e748</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1016/S2589-7500(23)00157-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/s2589-7500(23)00157-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref66">
        <label>66</label>
        <nlm-citation citation-type="web">
          <article-title>The economics of large language models. The cost of ChatGPT-like search, training GPT-3, and a general framework for mapping the LLM cost trajectory</article-title>
          <source>Sunyan</source>
          <year>2023</year>
          <month>01</month>
          <day>21</day>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://sunyan.substack.com/p/the-economics-of-large-language-models">https://sunyan.substack.com/p/the-economics-of-large-language-models</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref67">
        <label>67</label>
        <nlm-citation citation-type="web">
          <article-title>The $360K question about large language models economics</article-title>
          <source>TrueFoundry</source>
          <year>2023</year>
          <month>06</month>
          <day>22</day>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.truefoundry.com/blog/economics-of-large-language-models">https://www.truefoundry.com/blog/economics-of-large-language-models</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref68">
        <label>68</label>
        <nlm-citation citation-type="web">
          <source>National Institute for Health and Care Research</source>
          <access-date>2024-10-08</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.nihr.ac.uk/ppi-patient-and-public-involvement-resources-applicants-nihr-research-programmes">https://www.nihr.ac.uk/ppi-patient-and-public-involvement-resources-applicants-nihr-research-programmes</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref69">
        <label>69</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Donia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shaw</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Co-design and ethical artificial intelligence for health: an agenda for critical research and practice</article-title>
          <source>Big Data Soc</source>
          <year>2021</year>
          <month>12</month>
          <day>17</day>
          <volume>8</volume>
          <issue>2</issue>
          <fpage>20539517211065248</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1177/20539517211065248"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/20539517211065248</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref70">
        <label>70</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ayers</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Poliak</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Leas</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Kelley</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Faix</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Goodman</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Longhurst</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Hogarth</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <article-title>Comparing physician and artificial intelligence chatbot responses to patient questions posted to a public social media forum</article-title>
          <source>JAMA Intern Med</source>
          <year>2023</year>
          <month>06</month>
          <day>01</day>
          <volume>183</volume>
          <issue>6</issue>
          <fpage>589</fpage>
          <lpage>596</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37115527"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id>
          <pub-id pub-id-type="medline">37115527</pub-id>
          <pub-id pub-id-type="pii">2804309</pub-id>
          <pub-id pub-id-type="pmcid">PMC10148230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref71">
        <label>71</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dash</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Thapa</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Banda</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Swaminathan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kashyap</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kotecha</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of GPT-3.5 and GPT-4 for supporting real-world information needs in healthcare delivery</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online 2023</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2304.13714</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
