<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="review-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v26i1e60807</article-id>
      <article-id pub-id-type="pmid">39052324</article-id>
      <article-id pub-id-type="doi">10.2196/60807</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Review</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Review</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Performance of ChatGPT Across Different Versions in Medical Licensing Examinations Worldwide: Systematic Review and Meta-Analysis</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Jin</surname>
            <given-names>Qiao</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Haze</surname>
            <given-names>Tatsuya</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Pang</surname>
            <given-names>Jinyong</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chen</surname>
            <given-names>Fangyuan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>Mingxin</given-names>
          </name>
          <degrees>MA</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Health Communication</institution>
            <institution>Graduate School of Medicine</institution>
            <institution>The University of Tokyo</institution>
            <addr-line>7-3-1 Hongo, Bunkyo</addr-line>
            <addr-line>Tokyo, 113-8655</addr-line>
            <country>Japan</country>
            <phone>81 03 5800 6549</phone>
            <fax>81 03 5689 0726</fax>
            <email>liumingxin98@g.ecc.u-tokyo.ac.jp</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6320-544X</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Okuhara</surname>
            <given-names>Tsuyoshi</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6251-3587</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Chang</surname>
            <given-names>XinYi</given-names>
          </name>
          <degrees>MA</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8546-9494</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Shirabe</surname>
            <given-names>Ritsuko</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6427-210X</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Nishiie</surname>
            <given-names>Yuriko</given-names>
          </name>
          <degrees>MPH</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0005-9004-4182</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Okada</surname>
            <given-names>Hiroko</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7877-9753</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Kiuchi</surname>
            <given-names>Takahiro</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-5934-0681</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Health Communication</institution>
        <institution>Graduate School of Medicine</institution>
        <institution>The University of Tokyo</institution>
        <addr-line>Tokyo</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Health Communication</institution>
        <institution>School of Public Health</institution>
        <institution>Graduate School of Medicine, The University of Tokyo</institution>
        <addr-line>Tokyo</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Industrial Engineering and Economics</institution>
        <institution>School of Engineering</institution>
        <institution>Tokyo Institute of Technology</institution>
        <addr-line>Tokyo</addr-line>
        <country>Japan</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Mingxin Liu <email>liumingxin98@g.ecc.u-tokyo.ac.jp</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>25</day>
        <month>7</month>
        <year>2024</year>
      </pub-date>
      <volume>26</volume>
      <elocation-id>e60807</elocation-id>
      <history>
        <date date-type="received">
          <day>22</day>
          <month>5</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>2</day>
          <month>6</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>11</day>
          <month>6</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>15</day>
          <month>6</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Mingxin Liu, Tsuyoshi Okuhara, XinYi Chang, Ritsuko Shirabe, Yuriko Nishiie, Hiroko Okada, Takahiro Kiuchi. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 25.07.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2024/1/e60807" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Over the past 2 years, researchers have used various medical licensing examinations to test whether ChatGPT (OpenAI) possesses accurate medical knowledge. The performance of each version of ChatGPT on the medical licensing examination in multiple environments showed remarkable differences. At this stage, there is still a lack of a comprehensive understanding of the variability in ChatGPT’s performance on different medical licensing examinations.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>In this study, we reviewed all studies on ChatGPT performance in medical licensing examinations up to March 2024. This review aims to contribute to the evolving discourse on artificial intelligence (AI) in medical education by providing a comprehensive analysis of the performance of ChatGPT in various environments. The insights gained from this systematic review will guide educators, policymakers, and technical experts to effectively and judiciously use AI in medical education.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We searched the literature published between January 1, 2022, and March 29, 2024, by searching query strings in Web of Science, PubMed, and Scopus. Two authors screened the literature according to the inclusion and exclusion criteria, extracted data, and independently assessed the quality of the literature concerning Quality Assessment of Diagnostic Accuracy Studies-2. We conducted both qualitative and quantitative analyses.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>A total of 45 studies on the performance of different versions of ChatGPT in medical licensing examinations were included in this study. GPT-4 achieved an overall accuracy rate of 81% (95% CI 78-84; <italic>P</italic>&#60;.01), significantly surpassing the 58% (95% CI 53-63; <italic>P</italic>&#60;.01) accuracy rate of GPT-3.5. GPT-4 passed the medical examinations in 26 of 29 cases, outperforming the average scores of medical students in 13 of 17 cases. Translating the examination questions into English improved GPT-3.5’s performance but did not affect GPT-4. GPT-3.5 showed no difference in performance between examinations from English-speaking and non–English-speaking countries (<italic>P</italic>=.72), but GPT-4 performed significantly better on examinations from English-speaking countries (<italic>P</italic>=.02). Any type of prompt could significantly improve GPT-3.5’s (<italic>P</italic>=.03) and GPT-4’s (<italic>P</italic>&#60;.01) performance. GPT-3.5 performed better on short-text questions than on long-text questions. The difficulty of the questions affected the performance of GPT-3.5 and GPT-4. In image-based multiple-choice questions (MCQs), ChatGPT’s accuracy rate ranges from 13.1% to 100%. ChatGPT performed significantly worse on open-ended questions than on MCQs.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>GPT-4 demonstrates considerable potential for future use in medical education. However, due to its insufficient accuracy, inconsistent performance, and the challenges posed by differing medical policies and knowledge across countries, GPT-4 is not yet suitable for use in medical education.</p>
        </sec>
        <sec sec-type="trial-registration">
          <title>Trial Registration</title>
          <p>PROSPERO CRD42024506687; https://www.crd.york.ac.uk/prospero/display_record.php?RecordID=506687</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>large language model</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>medical licensing examination</kwd>
        <kwd>medical education</kwd>
        <kwd>LLMs</kwd>
        <kwd>NLP</kwd>
        <kwd>natural language processing</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>language models</kwd>
        <kwd>review methods</kwd>
        <kwd>systematic</kwd>
        <kwd>meta-analysis</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>In November 2022, the web-based artificial intelligence (AI) chatbot ChatGPT (OpenAI) was released to the public and swiftly garnered global attention because of its ability to provide detailed answers to complex queries [<xref ref-type="bibr" rid="ref1">1</xref>]. ChatGPT has been extensively applied across various domains, including programming, education, business, and law, with notable success in each [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. Researchers have been actively exploring the potential roles and capabilities of ChatGPT in clinical diagnosis, health care, and medical education [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. The number of publications on this topic has increased dramatically since late 2022 [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Specifically, in medical education, ChatGPT can play several important roles, including, but not limited to, the following: First, compared to search engines like Google, which present a list of relevant pages, ChatGPT aims to provide concise and practical answers to users’ questions, making it an effective knowledge resource [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Second, in medical licensing examinations comprising multiple-choice questions (MCQs), ChatGPT can act as an “AI teaching assistant,” providing insights for each question, analyzing common errors, and reinforcing concepts interactively [<xref ref-type="bibr" rid="ref12">12</xref>]. Third, ChatGPT has the capability to analyze images. Although this feature is still in its early stages, it offers the potential for ChatGPT to serve as a “virtual mentor,” capable of analyzing medical images such as skin rashes and x-rays [<xref ref-type="bibr" rid="ref10">10</xref>]. 
Fourth, for most medical students who find it challenging to balance studying vast amounts of information, practicing evidence-based medicine, and fulfilling clinical duties, ChatGPT can provide concise summaries of clinical trials and generate key practical points from them [<xref ref-type="bibr" rid="ref10">10</xref>].</p>
        <p>However, a prerequisite for ChatGPT’s ability to help medical students in their studies and play a role in medical education, both now and in the future, is that ChatGPT has solid and accurate knowledge of medicine. Medical licensing examinations are a crucial part of the medical education pathway as they assess the readiness of aspiring doctors to enter clinical practice. These examinations vary in format and content across countries but typically test medical knowledge, clinical reasoning, and ethical decision-making [<xref ref-type="bibr" rid="ref13">13</xref>]. Over the past 2 years, researchers have used medical licensing examinations from various countries to test whether ChatGPT possesses accurate medical knowledge [<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref57">57</xref>].</p>
        <p>Although most of these studies used similar testing methods—inputting medical licensing examination questions into ChatGPT and recording the responses to calculate accuracy—the ChatGPT performance showed significant variation. A study conducted in the United States revealed that GPT-3.5 surpassed the 60% score threshold on the National Board of Medical Examiners (NBME)-free–Step-1 question, reaching the level of a third-year medical student [<xref ref-type="bibr" rid="ref21">21</xref>]. However, studies from South Korea, China, and Japan have indicated that GPT-3.5 failed to pass medical examinations in their respective countries [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref54">54</xref>]. Although GPT-4 performed better overall than GPT-3.5 [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref47">47</xref>], it did not pass the Japanese medical licensing examination [<xref ref-type="bibr" rid="ref49">49</xref>]. In addition, ChatGPT performance varies significantly across medical specialties within these examinations [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref33">33</xref>-<xref ref-type="bibr" rid="ref35">35</xref>].</p>
        <p>At this stage, there is still a lack of a comprehensive understanding of the variability in ChatGPT’s performance on different medical licensing examinations. We believe that prematurely using ChatGPT for clinical diagnosis and medical education without thoroughly evaluating its performance across various medical licensing examinations is irresponsible and could endanger human lives.</p>
      </sec>
      <sec>
        <title>Literature Review</title>
        <p>A total of 3 systematic reviews have explored ChatGPT’s performance in medical licensing examinations to the best of our knowledge [<xref ref-type="bibr" rid="ref58">58</xref>-<xref ref-type="bibr" rid="ref60">60</xref>].</p>
        <p>A study from the United States collected literature up to June 2, 2023, focusing on various types of medical licensing examinations in the United States [<xref ref-type="bibr" rid="ref58">58</xref>]. Among the 19 included studies, only 2 were comprehensive medical licensing examinations, the United States Medical Licensing Examination (USMLE), while the remaining 17 were medical specialty examinations, such as plastic surgery, anesthesia, and ophthalmology [<xref ref-type="bibr" rid="ref58">58</xref>]. In contrast to this study, our research extends the literature collection to a global scale and examines the performance of ChatGPT in medical licensing examinations in different countries and languages. We believe that the worldwide perspective of the current review is crucial because medical education and licensure standards vary significantly across countries.</p>
        <p>A study from Pakistan collected literature up to April 2023, focusing on the performance of GPT-3.5 in various medical licensing examinations worldwide [<xref ref-type="bibr" rid="ref59">59</xref>]. However, with the advent of the more advanced GPT-4, more studies have focused on GPT-4. Our research includes all ChatGPT versions and discusses their performance differences.</p>
        <p>A study from China collected the literature up to July 15, 2023 [<xref ref-type="bibr" rid="ref60">60</xref>]. This study reviewed the performance of ChatGPT for various medical questions. Of the 60 included studies, only 3 were medical licensing examinations. In addition, this study created a framework to evaluate the quality of studies on the performance of large language models (LLMs) in medical questions [<xref ref-type="bibr" rid="ref60">60</xref>]. We slightly modified this evaluation framework and applied it to this study.</p>
      </sec>
      <sec>
        <title>Study Aims and Objectives</title>
        <p>This study reviewed all studies on ChatGPT’s performance in medical licensing examinations from January 1, 2022, to March 29, 2024, to clarify the following issues:</p>
        <list list-type="order">
          <list-item>
            <p>Can ChatGPT pass the medical licensing examinations?</p>
          </list-item>
          <list-item>
            <p>How does ChatGPT’s performance compare to that of medical students?</p>
          </list-item>
          <list-item>
            <p>How did ChatGPT perform in different languages?</p>
          </list-item>
          <list-item>
            <p>What is the relationship between question difficulty and ChatGPT’s performance?</p>
          </list-item>
          <list-item>
            <p>What is the relationship between question length and ChatGPT’s performance?</p>
          </list-item>
          <list-item>
            <p>How did ChatGPT perform on image-based MCQs?</p>
          </list-item>
          <list-item>
            <p>How did ChatGPT perform on open-ended questions?</p>
          </list-item>
          <list-item>
            <p>What is the difference in ChatGPT’s performance with and without prompts?</p>
          </list-item>
          <list-item>
            <p>Comparison of GPT-3.5’s and GPT-4’s performances.</p>
          </list-item>
          <list-item>
            <p>How does ChatGPT perform in medical licensing examinations in English-speaking countries and non–English-speaking countries?</p>
          </list-item>
        </list>
        <p>By comprehensively evaluating the accuracy of the medical knowledge held by ChatGPT, we integrate these perspectives and offer comprehensive recommendations for applying ChatGPT in medical education.</p>
        <p>Overall, this systematic review aimed to fill the knowledge gap regarding the application of ChatGPT in medical licensing examinations. Further, it sought to contribute to the evolving discourse on AI in medical education and facilitate future developments and applications in this field. The insights gained from this systematic review will guide educators, policymakers, and technical experts to effectively and judiciously use AI in medical education.</p>
        <p>To the best of our knowledge, this is the first study to comprehensively review the performance of all versions of ChatGPT on medical licensing examinations across different countries.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <p>This systematic review followed the PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) flow diagrams and guidance [<xref ref-type="bibr" rid="ref61">61</xref>]. This systematic review was registered in the PROSPERO (International Prospective Register of Systematic Reviews) database on February 1, 2024 (CRD42024506687).</p>
      <sec>
        <title>Search Strategy</title>
        <p>We searched for specific query strings (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) using the advanced search function in PubMed, Web of Science, and Scopus, with Google Scholar as a supplementary source. Literature published from January 1, 2022, to March 29, 2024, was included in the literature search. The literature exported from these 3 platforms was imported into Rayyan [<xref ref-type="bibr" rid="ref62">62</xref>]. Two authors (ML and XC) independently screened the titles and abstracts of the retrieved studies using a search strategy to identify those that met the inclusion and exclusion criteria (<xref ref-type="boxed-text" rid="box1">Textbox 1</xref>). The full texts of these studies were then retrieved and independently assessed for eligibility by 2 authors. Any disagreements regarding the eligibility of specific studies were discussed and resolved by a third reviewer (TO). In addition to the database searches, we searched Google Scholar for triangulation on March 29, 2024. When the preprint and peer-reviewed literature data were identical, we included the peer-reviewed literature in our analysis. As part of the screening process, we recorded the reasons for study exclusion and presented them in a PRISMA flow diagram.</p>
        <boxed-text id="box1" position="float">
          <title>Inclusion and exclusion criteria.</title>
          <p>
            <bold>Inclusion criteria</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>The study tested the performance of ChatGPT in medical licensing examinations.</p>
            </list-item>
            <list-item>
              <p>Any type of original research literature (peer-reviewed papers, conference papers, preprints, letters, books, etc).</p>
            </list-item>
            <list-item>
              <p>Literature published from 2022 to 2024.</p>
            </list-item>
            <list-item>
              <p>Literature on the performance of ChatGPT in all languages.</p>
            </list-item>
            <list-item>
              <p>Literature on any version of ChatGPT.</p>
            </list-item>
            <list-item>
              <p>Literature on multiple-choice questions, open-ended questions, and all other types of questions for medical licensing examinations.</p>
            </list-item>
          </list>
          <p>
            <bold>Exclusion criteria</bold>
          </p>
          <list list-type="bullet">
            <list-item>
              <p>Nonnational-level medical licensing examination.</p>
            </list-item>
            <list-item>
              <p>Examinations other than comprehensive medical licensing examinations (eg, medical final examinations at universities, medical questions created by the authors themselves, and medical specialty examinations).</p>
            </list-item>
            <list-item>
              <p>Studies that are not related to ChatGPT.</p>
            </list-item>
            <list-item>
              <p>Duplicate studies.</p>
            </list-item>
            <list-item>
              <p>Studies that are not published in English.</p>
            </list-item>
            <list-item>
              <p>Systematic review.</p>
            </list-item>
          </list>
        </boxed-text>
      </sec>
      <sec>
        <title>Data Extraction and Management</title>
        <p>Two reviewers (ML and XC) independently extracted data from the included studies into an Excel (Microsoft Corp) spreadsheet. The data were compared, and inconsistencies were resolved through consensus or by a third reviewer (TO). The general characteristics to be extracted include the following: (1) title, (2) authors, (3) publication year, (4) publication date, (5) type of publication, (6) country of the medical licensing examination, (7) name of the medical licensing examination, (8) ChatGPT version, (9) language in which ChatGPT was tested, (10) duration of the test, (11) type of questions, (12) counts of correct or total questions, (13) accuracy rate, (14) did ChatGPT pass the examination, (15) comparison between medical students, and (16) was a prompt used.</p>
      </sec>
      <sec>
        <title>Assessing the Risk of Bias in the Included Studies</title>
        <p>A previous study developed an LLM evaluation framework based on the Quality Assessment of Diagnostic Accuracy Studies-2 (QUADAS-2) [<xref ref-type="bibr" rid="ref60">60</xref>,<xref ref-type="bibr" rid="ref63">63</xref>]. We modified and applied this evaluation framework in our study (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p>
        <p>Since this previous study collected papers on ChatGPT’s performance across all types of medical questions [<xref ref-type="bibr" rid="ref60">60</xref>], we modified the original framework, whereas our research focused on ChatGPT’s performance in medical licensing examinations. Specifically, we added 2 evaluation items, items 4 and 5, to address aspects specific to medical licensing examinations. We removed item 8 (are the questions individual stand-alone queries or a continuous conversation requiring multiple consecutive inquiries?) from the original evaluation framework, as it did not apply to this study.</p>
        <p>In our modified evaluation framework, “task generation,” “conversation structure,” and “evaluation” correspond to “patient selection,” “index test,” and “reference standard” in QUADAS-2, respectively. Items 2 and 7 correspond to “flow and timing” in QUADAS-2.</p>
      </sec>
      <sec>
        <title>Evidence Synthesis</title>
        <p>Our analysis focuses on GPT-3.5 and GPT-4.</p>
      </sec>
      <sec>
        <title>Qualitative Analyses</title>
        <p>We performed a comprehensive summary using narrative analysis and descriptive statistics for the contents of the included studies that were narrative or lacked sufficient data.</p>
      </sec>
      <sec>
        <title>Quantitative Analyses</title>
        <p>We used the raw correct and total data in each included study to calculate the accuracy rate. The calculation rules are as follows: if a study used 1 set of questions for repeated testing, the displayed accuracy rate is the average score of all attempts and the total number of questions in the set. If the study tested both the original language and translated English questions, the displayed accuracy rate was based on the scores from the original language examination questions. For studies tested with and without optimized prompts, the displayed accuracy rate was based on the scores without optimized prompts. In studies that included MCQs and open-ended questions, the displayed accuracy rate excluded scores from the open-ended questions.</p>
        <p>We conducted a meta-analysis of studies that tested ChatGPT using MCQs.</p>
        <p>The <italic>I</italic>² statistic was used to assess the effect of heterogeneity on the pooled results. When significant heterogeneity was present (<italic>I</italic>²&#62;50%), a random effects model was used; otherwise, a fixed effects model was used. Accuracy was reported with a 95% CI. The significance level was set at <italic>P</italic>&#60;.05. Meta-regression and subgroup analyses were conducted to examine the potential sources of heterogeneity and compare performances across different subgroups. A sensitivity analysis was conducted to assess the robustness of the meta-analysis results. The “metafor” and “meta” packages in R (version 4.4.0; R Core Team) were used for the meta-analysis, publication bias, and sensitivity analyses.</p>
        <p>In addition, we conducted post hoc power analysis for the random effects model results of each main group and subgroup. G*Power (version 3.1.9.7; Erdfelder, Faul, and Buchner) was used for the power analysis.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Literature Screening and Selection</title>
        <p>By searching the query strings in the Web of Science, Scopus, and PubMed, we retrieved 3698 papers from the Web of Science, 6354 papers from Scopus, and 2587 papers from PubMed. After excluding 3751 duplicate papers, 8888 papers remained. We excluded 278 non-English papers, leaving 8610 papers. After reading the abstracts of all 8610 papers, we excluded 8377 studies that were completely irrelevant to this review, leaving 233 studies remaining.</p>
        <p>A total of 137 studies focused on ChatGPT’s performance in medical specialty examinations, 11 on dental licensing examinations, 6 on nursing examinations, 6 on pharmacist examinations, and 25 on other medical examinations (eg, university medical entrance examinations and university medical final examinations). Further, 2 were systematic reviews, 1 was about nonnational medical examinations, and 2 lacked the necessary information. These studies did not meet the inclusion criteria.</p>
        <p>We then performed a supplementary search using Google Scholar and added 2 preprint papers on March 29, 2024. Ultimately, 45 papers were included in this systematic review (<xref rid="figure1" ref-type="fig">Figure 1</xref>) [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref14">14</xref>-<xref ref-type="bibr" rid="ref57">57</xref>].</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) flow diagram.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e60807_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Quality Assessment of Included Studies</title>
        <p>Two authors independently assessed the quality of the 45 studies using an evaluation framework, and any disagreements were resolved through discussion and consensus (<xref rid="figure2" ref-type="fig">Figure 2</xref>). The literature we collected tested ChatGPT’s performance using national medical licensing examinations comprising MCQs with standard answers. Consequently, items 13, 14, 15, and 21, which pertain to evaluators, were not mentioned in three-quarters of the included studies. Unlike open-ended questions, MCQs do not require multiple evaluators to adopt a double-anonymized approach to evaluate test results. Therefore, this does not increase the risk in the “reference standard” part.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Quality assessment of included studies using evaluation framework.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e60807_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>For item 7, more than half of the studies did not specify the exact test dates. On November 6, 2023, OpenAI developers announced that the cutoff dates for ChatGPT versions 3.5 and 4 were updated from September 2021 to January 2022 and April 2023, respectively [<xref ref-type="bibr" rid="ref64">64</xref>]. We believe that if the cutoff date of ChatGPT is updated during the testing period, this might affect the consistency of ChatGPT’s performance before and after the update.</p>
        <p>For item 10, more than half of the studies did not specify whether a new chat session was used to test different questions. Conducting different questions in the same session might have affected the ChatGPT performance.</p>
        <p>For reasons above, in the risk of bias assessment, only 2 studies and 3 studies were rated as high risk in the “index test” and “flow and timing” categories, respectively (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Risk of bias.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="250"/>
            <col width="190"/>
            <col width="170"/>
            <col width="220"/>
            <col width="170"/>
            <thead>
              <tr valign="top">
                <td>Author (year) and reference</td>
                <td>Patient selection</td>
                <td>Index test</td>
                <td>Reference standard</td>
                <td>Flow and timing</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Alessandri Bonetti et al (2024) [<xref ref-type="bibr" rid="ref14">14</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Aljindan et al (2023) [<xref ref-type="bibr" rid="ref15">15</xref>]</td>
                <td>Unclear</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Armitage (2024) [<xref ref-type="bibr" rid="ref16">16</xref>]</td>
                <td>Unclear</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Ebrahimian et al (2023) [<xref ref-type="bibr" rid="ref17">17</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Fang et al (2023) [<xref ref-type="bibr" rid="ref18">18</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Flores-Cohaila et al (2023) [<xref ref-type="bibr" rid="ref19">19</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Garabet et al (2023) [<xref ref-type="bibr" rid="ref20">20</xref>]</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
                <td>High</td>
              </tr>
              <tr valign="top">
                <td>Gilson et al (2023) [<xref ref-type="bibr" rid="ref21">21</xref>]</td>
                <td>Unclear</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Gobira et al (2023) [<xref ref-type="bibr" rid="ref22">22</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Guillen-Grima et al (2023) [<xref ref-type="bibr" rid="ref23">23</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Haze et al (2023) [<xref ref-type="bibr" rid="ref24">24</xref>]</td>
                <td>Low</td>
                <td>High</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Huang et al (2024) [<xref ref-type="bibr" rid="ref25">25</xref>]</td>
                <td>Low</td>
                <td>High</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Jang et al (2023) [<xref ref-type="bibr" rid="ref26">26</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Jung et al (2023) [<xref ref-type="bibr" rid="ref27">27</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Kao et al (2024) [<xref ref-type="bibr" rid="ref28">28</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Kataoka et al (2023) [<xref ref-type="bibr" rid="ref29">29</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Khorshidi et al (2023) [<xref ref-type="bibr" rid="ref30">30</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Kleinig et al (2023) [<xref ref-type="bibr" rid="ref31">31</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Kleinig et al (2023) [<xref ref-type="bibr" rid="ref32">32</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>High</td>
              </tr>
              <tr valign="top">
                <td>Knoedler et al (2024) [<xref ref-type="bibr" rid="ref33">33</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Kung et al (2023) [<xref ref-type="bibr" rid="ref3">3</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Lai et al (2023) [<xref ref-type="bibr" rid="ref34">34</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Lin et al (2024) [<xref ref-type="bibr" rid="ref35">35</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Meyer et al (2024) [<xref ref-type="bibr" rid="ref36">36</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Mihalache et al (2023) [<xref ref-type="bibr" rid="ref37">37</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Nakao et al (2024) [<xref ref-type="bibr" rid="ref38">38</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Oztermeli and Oztermeli (2023) [<xref ref-type="bibr" rid="ref39">39</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Roos et al (2023) [<xref ref-type="bibr" rid="ref40">40</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Rosoł et al (2023) [<xref ref-type="bibr" rid="ref41">41</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Scaioli et al (2023) [<xref ref-type="bibr" rid="ref42">42</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Shang et al (2023) [<xref ref-type="bibr" rid="ref43">43</xref>]</td>
                <td>Unclear</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Takagi et al (2023) [<xref ref-type="bibr" rid="ref44">44</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Tong et al (2023) [<xref ref-type="bibr" rid="ref45">45</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Torres-Zegarra et al (2023) [<xref ref-type="bibr" rid="ref46">46</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Wang et al (2023) [<xref ref-type="bibr" rid="ref47">47</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Wang et al (2023) [<xref ref-type="bibr" rid="ref48">48</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Watari et al (2023) [<xref ref-type="bibr" rid="ref49">49</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Weng et al (2023) [<xref ref-type="bibr" rid="ref50">50</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Yanagita et al (2023) [<xref ref-type="bibr" rid="ref51">51</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Yaneva et al (2024) [<xref ref-type="bibr" rid="ref52">52</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
              <tr valign="top">
                <td>Zhu et al (2023) [<xref ref-type="bibr" rid="ref53">53</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Zong et al (2024) [<xref ref-type="bibr" rid="ref54">54</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Rojas et al (2024) [<xref ref-type="bibr" rid="ref55">55</xref>]</td>
                <td>Low</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>Unclear</td>
              </tr>
              <tr valign="top">
                <td>Kung et al (2023) [<xref ref-type="bibr" rid="ref56">56</xref>]</td>
                <td>Unclear</td>
                <td>Unclear</td>
                <td>Low</td>
                <td>High</td>
              </tr>
              <tr valign="top">
                <td>Keshtkar et al (2023) [<xref ref-type="bibr" rid="ref57">57</xref>]</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
                <td>Low</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>General Characteristics of Included Studies</title>
        <p>Among the 45 reviewed papers, the earliest was published on February 8, 2023 [<xref ref-type="bibr" rid="ref21">21</xref>], and the latest on April 30, 2024 [<xref ref-type="bibr" rid="ref55">55</xref>]. The general characteristics of the studies are shown in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p>
        <p>The medical licensing examinations applied to test ChatGPT’s performance were from 17 countries and regions: Italy (n=2), Saudi Arabia (n=1), the United Kingdom (n=2), Iran (n=3), China (n=7), Peru (n=2), the United States (n=7), Brazil (n=1), Spain (n=1), Japan (n=6), Taiwan (n=4), South Korea (n=1), Germany (n=3), Australia (n=2), Turkey (n=1), Poland (n=1), and Chile (n=1; <xref rid="figure3" ref-type="fig">Figure 3</xref>).</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Countries where medical licensing examination was used to test ChatGPT.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e60807_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Of the 45 included studies, 29 tested the performance of GPT-4, and 26 tested the performance of GPT-3.5. A total of 14 studies tested both GPT-4 and GPT-3.5. In addition, 4 studies tested GPT-3, 1 tested InstructGPT, and 1 tested ChatGPT Plus.</p>
        <p>Regarding the countries and languages of the medical licensing examination questions used to test ChatGPT, 11 studies used examinations from an English-speaking country. Of the 34 medical licensing examinations of non–English-speaking countries, 22 used only the native language for testing, 3 translated the original language into English, and 9 used both the original and translated English questions.</p>
        <p>All 45 studies included MCQs, with 4 studies including open-ended questions, 1 study including calculation questions, and 1 study including patient history inquiry questions.</p>
      </sec>
      <sec>
        <title>Qualitative Analyses</title>
        <p>Regarding the performance of ChatGPT on passing the medical licensing examination, among the 26 studies testing GPT-3.5, 6 reported that GPT-3.5 passed the medical licensing examination, and 4 reported satisfactory performance, making up 38.5% (10/26) of the total. In the remaining studies, 1 was unclear, and 15 did not pass. Among the 29 studies testing GPT-4, 17 reported that GPT-4 passed the medical licensing examination, and 9 reported satisfactory performance, making up 89.7% (26/29) of the total. In the remaining studies, 1 was unclear and 2 did not pass (<xref rid="figure4" ref-type="fig">Figure 4</xref>). For the other ChatGPT models, among the 4 studies testing the GPT-3 performance, 2 did not pass, 1 was unclear, and 1 showed a satisfactory performance. The studies that tested GPT-4 with Vision (GPT-4V, which is specifically designed for image tasks), InstructGPT, and ChatGPT Plus showed the following results: passed, did not pass, and did not pass.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Performance of ChatGPT on passing the medical licensing examination.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e60807_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Regarding the performance of ChatGPT compared with medical students, 14 of 45 studies compared GPT-3.5’s performance with medical students, and 17 of 45 compared GPT-4’s performance with that of medical students. Four studies showed that GPT-3.5 surpassed medical students, accounting for 28.6% (4/14) of the studies. A total of 13 studies showed that GPT-4 surpassed medical students, accounting for 76.5% (13/17) of the studies (<xref rid="figure5" ref-type="fig">Figure 5</xref>). For the other ChatGPT models, 1 study showed that GPT-3 surpassed medical students, while another showed that it performed worse. One study indicated that InstructGPT performed worse than the students.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Performance of ChatGPT compared with medical students.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e60807_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>We also compared ChatGPT’s performance in the original language and English-translated questions of the same non-English medical licensing examination. In studies of medical licensing examinations in non–English-speaking countries, 9 used both the original language and English-translated questions to test ChatGPT’s performance, with 8 reporting comparative results (<xref ref-type="table" rid="table2">Table 2</xref>). Overall, for GPT-4, translating the original language into English had a limited effect on improving the performance. The accuracy improvement ranged from 0.17% to 8.65%, with 6 studies showing an accuracy increase of less than 5%. However, compared with GPT-4, GPT-3.5 showed significant improvement when tested in English in 4 studies. In 2 of these studies, GPT-3.5’s accuracy was more than 20% higher in English than in the original language.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>ChatGPT’s performance in original language and English-translated questions [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref57">57</xref>].</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="0"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Author (year) and reference</td>
                <td colspan="2">GPT-3.5 accuracy rate</td>
                <td colspan="3">GPT-4 accuracy rate</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Original language, n/n (%)</td>
                <td>English-translated, n/n (%)</td>
                <td>Original language, n/n (%)</td>
                <td colspan="2">English-translated, n/n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Fang et al (2023) [<xref ref-type="bibr" rid="ref18">18</xref>]</td>
                <td>Untested</td>
                <td>Untested</td>
                <td>197/260 (75.77%)</td>
                <td colspan="2">201/260 (77.31%)</td>
              </tr>
              <tr valign="top">
                <td>Guillen-Grima et al (2023) [<xref ref-type="bibr" rid="ref23">23</xref>]</td>
                <td>115/182 (63.2)</td>
                <td>121/182 (66.5)</td>
                <td>158/182 (86.8)</td>
                <td colspan="2">160/182 (87.9)</td>
              </tr>
              <tr valign="top">
                <td>Jang et al (2023) [<xref ref-type="bibr" rid="ref26">26</xref>]</td>
                <td>Untested</td>
                <td>Untested</td>
                <td>Unclear (51.8)</td>
                <td colspan="2">Unclear (60.5)</td>
              </tr>
              <tr valign="top">
                <td>Khorshidi et al (2023) [<xref ref-type="bibr" rid="ref30">30</xref>]</td>
                <td>Untested</td>
                <td>Untested</td>
                <td>161/198 (81.3)</td>
                <td colspan="2">167/198 (84.3)</td>
              </tr>
              <tr valign="top">
                <td>Rosoł et al (2023) [<xref ref-type="bibr" rid="ref41">41</xref>]</td>
                <td>320.5/585 (54.8)</td>
                <td>353/585 (60.3)</td>
                <td>465.5/585 (79.6)</td>
                <td colspan="2">466.5/585 (79.7)</td>
              </tr>
              <tr valign="top">
                <td>Tong et al (2023) [<xref ref-type="bibr" rid="ref45">45</xref>]</td>
                <td>Untested</td>
                <td>Untested</td>
                <td>130/160 (81.3)</td>
                <td colspan="2">138/160 (86.3)</td>
              </tr>
              <tr valign="top">
                <td>Wang et al (2023) [<xref ref-type="bibr" rid="ref47">47</xref>]</td>
                <td>56/100 (56)</td>
                <td>76/100 (76)</td>
                <td>84/100 (84)</td>
                <td colspan="2">86/100 (86)</td>
              </tr>
              <tr valign="top">
                <td>Keshtkar et al (2023) [<xref ref-type="bibr" rid="ref57">57</xref>]</td>
                <td>394/1105 (35.7)</td>
                <td>687/1105 (61.4)</td>
                <td colspan="2">Untested</td>
                <td>Untested</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>A total of 2 studies examined the correlation between GPT-3.5 performance and the length of the question text, and 3 studies examined this correlation for GPT-4. Both studies on GPT-3.5 showed a significant correlation between performance and the length of the question text; the longer the question text, the poorer the performance of GPT-3.5 [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. In contrast, none of the 3 studies on GPT-4 found a significant difference in performance between long- and short-text questions [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref50">50</xref>].</p>
        <p>A total of 8 studies examined the correlation between the difficulty of the questions and ChatGPT’s performance. A total of 7 studies indicated that both GPT-4 and GPT-3.5 performed worse on difficult questions than easier ones [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref49">49</xref>]. Only 1 study showed that the difficulty of the questions did not affect GPT-4’s performance. However, in this study, the difficulty was subjectively rated by 3 medical students rather than using official difficulty ratings [<xref ref-type="bibr" rid="ref45">45</xref>].</p>
        <p>Regarding ChatGPT’s performance with and without optimized prompts, in our review of 45 papers, 13 stated that researchers provided ChatGPT with prompts before asking questions. Most of these prompts were designed to help ChatGPT better understand its task, such as “You are now an experienced clinician; please answer the following questions” or “You are a medical student, and we will be using medical licensing examination questions to test you; please provide your best answers.” Researchers have not analyzed or elaborated on the impact of these task understanding prompts on ChatGPT’s performance. However, 3 studies used optimized prompts [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. A Korean study used 4 kinds of optimized prompts, including annotating Chinese terms in traditional Korean medicine, translating the instruction and question into English, providing examination-optimized instructions, and using self-consistency in the prompt. The results showed that ChatGPT’s accuracy increased from 51.82% to 66.18% with optimized prompts [<xref ref-type="bibr" rid="ref26">26</xref>]. In the other 2 studies, questions that ChatGPT initially answered incorrectly without prompts were reasked with optimized prompts, such as “Are you sure? Pretend to be a junior doctor with expertise in clinical practice and examination solving and retry” or “Could you double-check the answer?” ChatGPT could correctly answer up to 88.9% and 84% of these questions, respectively [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. For task understanding prompts, we conducted a subgroup analysis and meta-regression to examine whether they affected ChatGPT’s performance.</p>
        <p>Regarding the capability of ChatGPT in answering image-based MCQs, 4 studies have reported the performance of ChatGPT in image-based MCQs. Three tested GPT-4, and 1 compared GPT-4 and GPT-4V [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref55">55</xref>]. In a UK study, GPT-4 achieved an accuracy rate of 100% (3/3) on the image-based MCQs [<xref ref-type="bibr" rid="ref16">16</xref>]. In a Spanish study, the accuracy rate of GPT-4 for image-based MCQs in Spanish was 13%, and the accuracy rate was 26% after translating the image-based MCQs into English, twice as high as in Spanish [<xref ref-type="bibr" rid="ref23">23</xref>]. Japanese researchers tested GPT-4’s performance on image-based MCQs that provided both images and text and on image-based MCQs that provided only text. The rate of correctness was 68% (73/108) when both images and text were provided, and 72% (78/108) when only text was provided [<xref ref-type="bibr" rid="ref38">38</xref>]. Researchers in Chile compared the performance of ChatGPT-4 and ChatGPT-4V in image-based MCQs. The accuracy rates of GPT-4 and GPT-4V for image-based MCQs were 76.7% and 70%, respectively [<xref ref-type="bibr" rid="ref55">55</xref>].</p>
        <p>Regarding the performance of ChatGPT on questions other than MCQs, 4 studies compared ChatGPT’s performance on open-ended questions versus MCQs. Among them, 2 showed that ChatGPT performed significantly worse on open-ended questions than on MCQs [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref19">19</xref>], 1 showed slightly better performance on open-ended questions, and another asked ChatGPT 10 short questions, all of which received an “A” grade [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref56">56</xref>]. In a study using calculation questions from the Japanese medical licensing examination, ChatGPT’s performance on calculation questions was significantly worse than that of MCQs [<xref ref-type="bibr" rid="ref24">24</xref>]. In a study using patient history inquiry questions from the Chinese medical licensing examination to assess medical students’ clinical skills, ChatGPT passed the test and scored higher than the average medical student, achieving satisfactory performance [<xref ref-type="bibr" rid="ref53">53</xref>].</p>
      </sec>
      <sec>
        <title>Meta-Analysis</title>
        <p>We conducted a meta-analysis of the integrated accuracy of GPT-3.5 and GPT-4 in medical licensing examinations. The accuracy of the meta-analysis is displayed in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. A total of 25 studies reporting the accuracy of GPT-3.5 and 29 studies reporting the accuracy of GPT-4 were included in this meta-analysis. Owing to significant heterogeneity (GPT-3.5: <italic>I</italic>²=95% and GPT-4: <italic>I</italic>²=93%), both groups were analyzed using a random-effects model.</p>
        <p>The integrated accuracy for GPT-3.5 was 58% (95% CI 53-63; <italic>P</italic>&#60;.01), and the integrated accuracy for GPT-4 was 81% (95% CI 78-84; <italic>P</italic>&#60;.01; Figures S1 and S2 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p>
      </sec>
      <sec>
        <title>Meta-Regression and Subgroup Analysis</title>
        <p>We divided studies with GPT-3.5 and GPT-4 in Figures S1 and S2 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref> into 3 subgroups, respectively.</p>
        <p>Subgroup 1 divided the studies into those using medical licensing examinations from English-speaking countries to test ChatGPT and those using examinations from non–English-speaking countries with a native language. Subgroup 2 categorized studies based on whether they used prompts to test ChatGPT or not. In Figures S3 and S4 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>, “yes” indicates the use of prompts, while “no” indicates the absence of prompts. Subgroup 3 categorized studies according to the “flow and timing” evaluation in <xref ref-type="table" rid="table1">Table 1</xref>, with “low risk” forming 1 category and “unclear” and “high risk” forming another. In Figures S5 and S6 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>, “yes” means “low risk,” implying that ChatGPT’s performance might not be affected by the testing date and source date. “No” means “high risk” and “unclear,” implying that ChatGPT’s performance might be influenced by the testing date and source date. We conducted meta-regression and subgroup analyses for all subgroups to examine potential sources of heterogeneity and compare performances.</p>
        <p>In subgroup analysis of subgroup 1, because of significant heterogeneity (GPT-3.5 tested in medical licensing examinations of English-speaking countries: <italic>I</italic>²=80%, GPT-3.5 tested in original language examinations of non–English-speaking countries: <italic>I</italic>²=96%, GPT-4 tested in medical licensing examinations of English-speaking countries: <italic>I</italic>²=69%, and GPT-4 tested in original language examinations of non–English-speaking countries: <italic>I</italic>²=93%), all 4 groups were analyzed using a random-effects model.</p>
        <p>The integrated accuracy for GPT-3.5 in examinations from English-speaking countries was 57% (95% CI 52-62; <italic>P</italic>&#60;.01), and in examinations from non–English-speaking countries with original languages, it was 58% (95% CI 52-64; <italic>P</italic>&#60;.01). No statistically significant differences were observed (<italic>P</italic>=.72; Figure S7 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p>
        <p>For GPT-4, the integrated accuracy in examinations from English-speaking countries was 86% (95% CI 82-89; <italic>P</italic>&#60;.01), and in examinations from non–English-speaking countries with original languages, it was 80% (95% CI 76-83; <italic>P</italic>&#60;.01). Statistically significant differences were observed between the results (<italic>P</italic>=.02; Figure S8 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p>
        <p>In the subgroup analysis of subgroup 2, because of significant heterogeneity (GPT-3.5 in subgroup “yes:” <italic>I</italic>²=92%, GPT-3.5 in subgroup “no:” <italic>I</italic>²=95%, GPT-4 in subgroup “yes:” <italic>I</italic>²=68%, and GPT-4 in subgroup “no:” <italic>I</italic>²=94%), all 4 groups were analyzed using a random-effects model.</p>
        <p>The integrated accuracy for GPT-3.5 in examinations with prompts was 68% (95% CI 57-77; <italic>P</italic>&#60;.01), and in examinations without prompts, it was 54% (95% CI 50-59; <italic>P</italic>&#60;.01). Statistically significant differences were observed between the results (<italic>P</italic>=.03; Figure S3 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p>
        <p>The integrated accuracy for GPT-4 in examinations with prompts was 85% (95% CI 83-88; <italic>P</italic>&#60;.01), and in examinations without prompts, it was 79% (95% CI 75-82; <italic>P</italic>&#60;.01). Statistically significant differences were observed between the results (<italic>P</italic>&#60;.01; Figure S4 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p>
        <p>In the subgroup analysis of subgroup 3, because of significant heterogeneity (GPT-3.5 in subgroup “yes:” <italic>I</italic>²=96%, GPT-3.5 in subgroup “no:” <italic>I</italic>²=92%, GPT-4 in subgroup “yes:” <italic>I</italic>²=71%, and GPT-4 in subgroup “no:” <italic>I</italic>²=95%), all 4 groups were analyzed using a random-effects model.</p>
        <p>The integrated accuracy for studies in which GPT-3.5’s performance may be influenced by testing date and source date was 55% (95% CI 51-60; <italic>P</italic>&#60;.01), and in studies in which GPT-3.5’s performance may not be influenced, it was 62% (95% CI 53-71; <italic>P</italic>&#60;.01). No statistically significant differences were observed (<italic>P</italic>=.19; Figure S5 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p>
        <p>The integrated accuracy for studies in which GPT-4’s performance may be influenced by testing date and source date was 80% (95% CI 75-83; <italic>P</italic>&#60;.01), and in studies in which GPT-4’s performance may not be influenced, it was 83% (95% CI 80-86; <italic>P</italic>&#60;.01). No statistically significant differences were observed (<italic>P</italic>=.12; Figure S6 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p>
        <p>Regarding the meta-regression results for all subgroups, the use of prompts is likely to be a source of potential heterogeneity and showed a significant effect on the accuracy rates of GPT-3.5 and GPT-4 (subgroup 2), as indicated by an estimated regression coefficient of 0.54 (<italic>P</italic>=.01) and 0.46 (<italic>P</italic>=.02), respectively. Meta-regression of subgroups 1 and 3 did not show statistically significant effects on accuracy rates (all <italic>P</italic>&#62;.05; <xref ref-type="table" rid="table3">Table 3</xref>).</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Meta-regression results of 3 subgroups of GPT-3.5 and GPT-4.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="400"/>
            <col width="0"/>
            <col width="430"/>
            <col width="0"/>
            <col width="140"/>
            <thead>
              <tr valign="top">
                <td colspan="3">Version</td>
                <td colspan="2">Estimated regression coefficient</td>
                <td><italic>P</italic> value</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6">
                  <bold>GPT-3.5</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Subgroup 1</td>
                <td colspan="2">–0.03</td>
                <td colspan="2">.91</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Subgroup 2</td>
                <td colspan="2">0.54</td>
                <td colspan="2">.01</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Subgroup 3</td>
                <td colspan="2">0.28</td>
                <td colspan="2">.19</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>GPT-4</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Subgroup 1</td>
                <td colspan="2">–0.39</td>
                <td colspan="2">.08</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Subgroup 2</td>
                <td colspan="2">0.46</td>
                <td colspan="2">.02</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Subgroup 3</td>
                <td colspan="2">0.25</td>
                <td colspan="2">.18</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Publication Bias</title>
        <p>No publication bias was detected among the included studies, as indicated by the funnel plots (Figure S9 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p>
      </sec>
      <sec>
        <title>Sensitivity Analyses</title>
        <p>We used a random effects model to assess the impact of excluding individual studies on overall effects. The sensitivity analysis plot showed that no single study significantly affected the overall meta-analysis results. This demonstrates the robustness of the meta-analysis results (Figure S10 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p>
      </sec>
      <sec>
        <title>Power Analysis</title>
        <p>We conducted post hoc power analysis for the main groups and subgroups using the results of the random effects model (<xref ref-type="table" rid="table4">Table 4</xref>). Subgroup 1 of GPT-3.5 had a power of 0.17. In this subgroup, we believe the sample size is adequate. The low power might be due to 2 main reasons. First, the intergroup difference is minimal, with effect sizes being very close (58% and 57%). Second, the data may have high heterogeneity (<italic>I</italic>²=80% and 96%). In the main group and other subgroups, the power was 1 or close to 1, indicating sufficient power to detect the anticipated effect size with the given sample size for the random effects model.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Power analysis results of main groups and subgroups.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="90"/>
            <col width="580"/>
            <col width="330"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Versions and groups</td>
                <td>Power</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>GPT-3.5</td>
                <td>Main group (integrated accuracy rate in Figure S1 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>)</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>GPT-4</td>
                <td>Main group (integrated accuracy rate in Figure S2 in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>)</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>GPT-3.5</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Subgroup 1</td>
                <td>0.17</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Subgroup 2</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Subgroup 3</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <bold>GPT-4</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Subgroup 1</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Subgroup 2</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Subgroup 3</td>
                <td>0.98</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Our systematic review and meta-analysis are the first to comprehensively evaluate the performance of all versions of ChatGPT across various medical licensing examination environments. Overall, GPT-4 significantly outperformed GPT-3.5; however, there are still some issues that make it difficult to use in medical education at this stage.</p>
        <p>Regarding the accuracy of ChatGPT on MCQs, while 2 previous studies conducted meta-analyses that yielded accuracy rates of 61% and 56%, respectively, we noted that these accuracy rates reflected the performance of all versions of ChatGPT without differentiation by version [<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref56">56</xref>]. Our review found that GPT-4 achieved an integrated accuracy rate of 81% for MCQs in medical licensing examinations, passing nearly all tested examinations and surpassing the average performance of medical students in three-quarters of the tests. In contrast, GPT-3.5 achieved an integrated accuracy rate of 58%, failing to pass more than half of the medical examinations and surpassing the average performance of medical students in only 4 of 14 tests. Therefore, regarding accuracy rate, passing rate, and comparison with medical students, GPT-4 significantly surpassed GPT-3.5.</p>
        <p>In medical licensing examinations from non–English-speaking countries, translating the original language questions into English significantly improved GPT-3.5’s performance but did not affect GPT-4’s performance. This indicates that GPT-4 has a much higher proficiency in languages other than English than GPT-3.5. However, based on the results of subgroup analysis for comparing GPT-3.5 and GPT-4 in medical licensing examinations from English-speaking and non–English-speaking countries, we found that GPT-4 performed better in English-speaking countries. In contrast, GPT-3.5 showed no performance difference between examinations from English-speaking and non–English-speaking countries.</p>
        <p>Additionally, based on the results of qualitative analysis and subgroup analysis, we found that both “optimized prompts” and “task understanding prompts” could significantly improve ChatGPT’s performance. When using prompts, the accuracy rates of GPT-3.5 and GPT-4 were 68% and 85%, respectively, which were significantly higher than the accuracy rates of 54% and 79% without prompts.</p>
        <p>The testing date and source date of each study were not sources of potential heterogeneity and did not significantly affect the performance of ChatGPT.</p>
      </sec>
      <sec>
        <title>Challenge of Using ChatGPT in Medical Education</title>
        <p>First, although the AI hallucinations of GPT-4 have been significantly reduced compared to earlier versions, GPT-4 still generates incorrect information because the data used to train these models is not always correct [<xref ref-type="bibr" rid="ref65">65</xref>]. We observed that in all tests of GPT-4, only 2 instances achieved an accuracy rate above 90%. The only example of a perfect accuracy rate was in the UK study, in which GPT-4 correctly answered all 20 questions [<xref ref-type="bibr" rid="ref16">16</xref>]. However, the number of questions used in this test was significantly lower than those used in other studies. We believe that this demonstrates ChatGPT’s potential for future use in medical education but does not imply that medical students can rely on ChatGPT to acquire medical knowledge or prepare for examinations. Traditional sources of medical knowledge, such as medical school courses and textbooks, are completely reliable. However, because most professional medical knowledge exists in book form [<xref ref-type="bibr" rid="ref50">50</xref>] and medical expertise on the internet is not always reliable [<xref ref-type="bibr" rid="ref66">66</xref>], the medical knowledge that ChatGPT currently holds is not entirely accurate. In this context, if medical students rely on ChatGPT as a trusted source of expertise and acquire incorrect medical knowledge, the reliability of their knowledge and skills is significantly compromised. This is unacceptable in the medical field, as it directly impacts human lives. Therefore, GPT-4 passing medical licensing examinations does not imply that it can be used as a source of knowledge in medical education.</p>
        <p>Previous studies have noted that the responses generated by GPT-3.5 are nondeterministic and random [<xref ref-type="bibr" rid="ref67">67</xref>-<xref ref-type="bibr" rid="ref69">69</xref>]. This study found that although the stability of GPT-4 has significantly improved compared to that of GPT-3.5, it still exhibits a degree of randomness in its outputs. Although GPT-4 achieved an overall accuracy of 81% across all tests, it only scored 52% on the Korean medical licensing examination, even lower than the overall accuracy of GPT-3.5 (58%) [<xref ref-type="bibr" rid="ref25">25</xref>]. In addition, in 4 studies using Japanese medical licensing examination questions, although GPT-4 passed 3 of the tests, it only achieved an accuracy of 67% in one and did not pass [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref51">51</xref>]. Furthermore, the use of optimized prompts and the difficulty of the questions can affect ChatGPT’s performance stability. If millions of medical students use ChatGPT for learning, this randomness could be significantly magnified and affect their learning outcomes.</p>
        <p>Moreover, different countries’ medical policies, and cultural, ethical, and unique local traditional medical knowledge pose significant challenges for ChatGPT [<xref ref-type="bibr" rid="ref70">70</xref>]. Regarding varying medical policies and ethics, a Chinese study mentioned that abortion is prohibited in the United States but allowed in certain circumstances in China [<xref ref-type="bibr" rid="ref48">48</xref>]. Although euthanasia is legal in many countries, it is illegal in Japan. ChatGPT chose the option of euthanasia in the Japanese medical licensing examination [<xref ref-type="bibr" rid="ref25">25</xref>]. ChatGPT may struggle to adapt to localized medical policies and ethics. In addition, East Asian countries still use local traditional medicine (eg, Chinese medicine), and most local traditional medicine learning materials are written in the native languages. These materials might not be accessible on the internet and included in ChatGPT’s training data set, making it difficult for ChatGPT to provide accurate answers to such topics [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref54">54</xref>].</p>
        <p>In the evaluation of image-based questions, we observed significant variations in the performance of GPT-4, with accuracy rates ranging from 13% to 100% [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref55">55</xref>]. However, there were only 3 questions in which GPT-4 achieved 100% accuracy, which is too small a sample size to demonstrate its proficiency in handling image-based questions [<xref ref-type="bibr" rid="ref16">16</xref>]. In addition, a study from Japan tested the performance of ChatGPT when provided with images and text versus text only. Surprisingly, ChatGPT performed better when given only text than when provided with both images and text [<xref ref-type="bibr" rid="ref38">38</xref>]. Similarly, a study from Chile found that GPT-4V, designed explicitly for image tasks, performed worse on image-based questions than GPT-4 [<xref ref-type="bibr" rid="ref55">55</xref>]. We believe that studies testing ChatGPT’s performance on image-based questions are limited at this stage. Therefore, comprehensive and reliable conclusions cannot be drawn. Consequently, using ChatGPT for image-based medical education is extremely risky.</p>
        <p>Finally, human teachers usually recognize their knowledge limitations when faced with uncertain questions and correct their mistakes by consulting resources. However, the fatal issue with ChatGPT is that, owing to the nature of AI language models, it can provide detailed and logically sound explanations for incorrect answers [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. Given ChatGPT’s authoritative writing style, students are likely to believe and memorize the incorrect information provided by ChatGPT [<xref ref-type="bibr" rid="ref71">71</xref>].</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This systematic review did not include studies on the performance of ChatGPT in various medical specialty examinations, dental licensing examinations, pharmacy examinations, and other medical-related assessments. Future studies should review the performance of ChatGPT in these specific medical fields. Studies published in languages other than English were excluded from the systematic review. This may omit the literature that tests the performance of ChatGPT on non–English-speaking medical licensing examinations.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>A total of 45 studies on the performance of different versions of ChatGPT in medical licensing examinations were included in this systematic review. GPT-4 achieved an overall accuracy rate of 81%, significantly surpassing GPT-3.5, and, in most cases, passed the medical examinations, outperforming the average scores of medical students. Thus, GPT-4 demonstrates considerable potential for future use in medical education. However, because the knowledge of ChatGPT is not entirely accurate and its performance can be inconsistent, and because of the challenges posed by differing medical policies and knowledge across countries, we believe that GPT-4 is not yet suitable for use in medical education.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Query strings of Web of Science, Scopus, and PubMed.</p>
        <media xlink:href="jmir_v26i1e60807_app1.docx" xlink:title="DOCX File , 17 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Evaluation framework used in this systematic review.</p>
        <media xlink:href="jmir_v26i1e60807_app2.docx" xlink:title="DOCX File , 19 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>General characteristics of included studies.</p>
        <media xlink:href="jmir_v26i1e60807_app3.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 17 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Supplementary figures.</p>
        <media xlink:href="jmir_v26i1e60807_app4.pptx" xlink:title="PPTX File , 1105 KB"/>
      </supplementary-material>
      <supplementary-material id="app5">
        <label>Multimedia Appendix 5</label>
        <p>PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) 2020 checklist.</p>
        <media xlink:href="jmir_v26i1e60807_app5.docx" xlink:title="DOCX File , 31 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">MCQ</term>
          <def>
            <p>multiple-choice question</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">NBME</term>
          <def>
            <p>National Board of Medical Examiners</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">PRISMA</term>
          <def>
            <p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">PROSPERO</term>
          <def>
            <p>International Prospective Register of Systematic Reviews</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">QUADAS-2</term>
          <def>
            <p>Quality Assessment of Diagnostic Accuracy Studies-2</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">USMLE</term>
          <def>
            <p>United States Medical Licensing Examination</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by Japan Society for Promotion of Science (JSPS) Grants-in-Aid for Scientific Research (KAKENHI) grant (24KJ0830).</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="web">
          <article-title>ChatGPT</article-title>
          <source>OpenAI</source>
          <access-date>2024-02-12</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://chat.openai.com/chat">https://chat.openai.com/chat</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khlaif</surname>
              <given-names>ZN</given-names>
            </name>
            <name name-style="western">
              <surname>Mousa</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hattab</surname>
              <given-names>MK</given-names>
            </name>
            <name name-style="western">
              <surname>Itmazi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hassan</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Sanmugam</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ayyoub</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>The potential and concerns of using AI in scientific research: ChatGPT performance evaluation</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e47049</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e47049/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/47049</pub-id>
          <pub-id pub-id-type="medline">37707884</pub-id>
          <pub-id pub-id-type="pii">v9i1e47049</pub-id>
          <pub-id pub-id-type="pmcid">PMC10636627</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Borchert</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hickman</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>Pepys</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sadler</surname>
              <given-names>TJ</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on the situational judgement test-a professional dilemmas-based examination for doctors in the United Kingdom</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e48978</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e48978/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48978</pub-id>
          <pub-id pub-id-type="medline">37548997</pub-id>
          <pub-id pub-id-type="pii">v9i1e48978</pub-id>
          <pub-id pub-id-type="pmcid">PMC10442724</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rahman</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Watanobe</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT for education and research: Opportunities, threats, and strategies</article-title>
          <source>Appl Sci</source>
          <year>2023</year>
          <volume>13</volume>
          <issue>9</issue>
          <fpage>5783</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.3390/app13095783"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/app13095783</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mo</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Exploring the performance of ChatGPT-4 in the Taiwan audiologist qualification examination: preliminary observational study highlighting the potential of AI chatbots in hearing care</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>e55595</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024//e55595/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/55595</pub-id>
          <pub-id pub-id-type="medline">38693697</pub-id>
          <pub-id pub-id-type="pii">v10i1e55595</pub-id>
          <pub-id pub-id-type="pmcid">PMC11067446</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kuroiwa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sarcon</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ibara</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Yamada</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Yamamoto</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tsukamoto</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Fujita</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>The potential of ChatGPT as a self-diagnostic tool in common orthopedic diseases: exploratory study</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <volume>25</volume>
          <fpage>e47621</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023//e47621/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/47621</pub-id>
          <pub-id pub-id-type="medline">37713254</pub-id>
          <pub-id pub-id-type="pii">v25i1e47621</pub-id>
          <pub-id pub-id-type="pmcid">PMC10541638</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yeganova</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>PT</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Comeau</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Islamaj</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kapoor</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Opportunities and challenges for ChatGPT and large language models in biomedicine and health</article-title>
          <source>Brief Bioinform</source>
          <year>2023</year>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>bbad493</fpage>
          <pub-id pub-id-type="doi">10.1093/bib/bbad493</pub-id>
          <pub-id pub-id-type="medline">38168838</pub-id>
          <pub-id pub-id-type="pii">7505071</pub-id>
          <pub-id pub-id-type="pmcid">PMC10762511</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gödde</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Nöhl</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wolf</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Rupert</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rimkus</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ehlers</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Breuckmann</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Sellmann</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>A SWOT (Strengths, Weaknesses, Opportunities, and Threats) analysis of ChatGPT in the medical literature: concise review</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <volume>25</volume>
          <fpage>e49368</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023//e49368/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/49368</pub-id>
          <pub-id pub-id-type="medline">37865883</pub-id>
          <pub-id pub-id-type="pii">v25i1e49368</pub-id>
          <pub-id pub-id-type="pmcid">PMC10690535</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tsang</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Practical applications of ChatGPT in undergraduate medical education</article-title>
          <source>J Med Educ Curric Dev</source>
          <year>2023</year>
          <volume>10</volume>
          <fpage>23821205231178449</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/23821205231178449?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/23821205231178449</pub-id>
          <pub-id pub-id-type="medline">37255525</pub-id>
          <pub-id pub-id-type="pii">10.1177_23821205231178449</pub-id>
          <pub-id pub-id-type="pmcid">PMC10226299</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hristidis</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Ruggiano</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>EL</given-names>
            </name>
            <name name-style="western">
              <surname>Ganta</surname>
              <given-names>SRR</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT vs Google for queries related to dementia and other cognitive decline: comparison of results</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <volume>25</volume>
          <fpage>e48966</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023//e48966/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48966</pub-id>
          <pub-id pub-id-type="medline">37490317</pub-id>
          <pub-id pub-id-type="pii">v25i1e48966</pub-id>
          <pub-id pub-id-type="pmcid">PMC10410383</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>The rise of ChatGPT: exploring its potential in medical education</article-title>
          <source>Anat Sci Educ</source>
          <year>2023</year>
          <pub-id pub-id-type="doi">10.1002/ase.2270</pub-id>
          <pub-id pub-id-type="medline">36916887</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Price</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Lynn</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Coombes</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gale</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>de Bere</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Archer</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>The international landscape of medical licensing examinations: a typology derived from a systematic review</article-title>
          <source>Int J Health Policy Manag</source>
          <year>2018</year>
          <volume>7</volume>
          <issue>9</issue>
          <fpage>782</fpage>
          <lpage>790</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/30316226"/>
          </comment>
          <pub-id pub-id-type="doi">10.15171/ijhpm.2018.32</pub-id>
          <pub-id pub-id-type="medline">30316226</pub-id>
          <pub-id pub-id-type="pmcid">PMC6186476</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alessandri Bonetti</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Giorgino</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gallo Afflitto</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>De Lorenzi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Egro</surname>
              <given-names>FM</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT perform on the Italian residency admission national exam compared to 15,869 medical graduates?</article-title>
          <source>Ann Biomed Eng</source>
          <year>2024</year>
          <volume>52</volume>
          <issue>4</issue>
          <fpage>745</fpage>
          <lpage>749</lpage>
          <pub-id pub-id-type="doi">10.1007/s10439-023-03318-7</pub-id>
          <pub-id pub-id-type="medline">37490183</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10439-023-03318-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Aljindan</surname>
              <given-names>FK</given-names>
            </name>
            <name name-style="western">
              <surname>Al Qurashi</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Albalawi</surname>
              <given-names>IAS</given-names>
            </name>
            <name name-style="western">
              <surname>Alanazi</surname>
              <given-names>AMM</given-names>
            </name>
            <name name-style="western">
              <surname>Aljuhani</surname>
              <given-names>HAM</given-names>
            </name>
            <name name-style="western">
              <surname>Falah Almutairi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Aldamigh</surname>
              <given-names>OA</given-names>
            </name>
            <name name-style="western">
              <surname>Halawani</surname>
              <given-names>IR</given-names>
            </name>
            <name name-style="western">
              <surname>K Zino Alarki</surname>
              <given-names>SM</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT conquers the Saudi medical licensing exam: exploring the accuracy of artificial intelligence in medical knowledge assessment and implications for modern medical education</article-title>
          <source>Cureus</source>
          <year>2023</year>
          <volume>15</volume>
          <issue>9</issue>
          <fpage>e45043</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37829968"/>
          </comment>
          <pub-id pub-id-type="doi">10.7759/cureus.45043</pub-id>
          <pub-id pub-id-type="medline">37829968</pub-id>
          <pub-id pub-id-type="pmcid">PMC10566535</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Armitage</surname>
              <given-names>RC</given-names>
            </name>
          </person-group>
          <article-title>Performance of generative pre-trained transformer-4 (GPT-4) in Membership of the Royal College of General Practitioners (MRCGP)-style examination questions</article-title>
          <source>Postgrad Med J</source>
          <year>2024</year>
          <volume>100</volume>
          <issue>1182</issue>
          <fpage>274</fpage>
          <lpage>275</lpage>
          <pub-id pub-id-type="doi">10.1093/postmj/qgad128</pub-id>
          <pub-id pub-id-type="medline">38142282</pub-id>
          <pub-id pub-id-type="pii">7492707</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ebrahimian</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Behnam</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ghayebi</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Sobhrakhshankhah</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT in Iranian medical licensing examination: evaluating the diagnostic accuracy and decision-making capabilities of an AI-based model</article-title>
          <source>BMJ Health Care Inform</source>
          <year>2023</year>
          <volume>30</volume>
          <issue>1</issue>
          <fpage>e100815</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://informatics.bmj.com/lookup/pmidlookup?view=long&#38;pmid=38081765"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjhci-2023-100815</pub-id>
          <pub-id pub-id-type="medline">38081765</pub-id>
          <pub-id pub-id-type="pii">bmjhci-2023-100815</pub-id>
          <pub-id pub-id-type="pmcid">PMC10729145</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ling</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT-4 preform on non-English national medical licensing examination? An evaluation in Chinese language</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <volume>2</volume>
          <issue>12</issue>
          <fpage>e0000397</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38039286"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000397</pub-id>
          <pub-id pub-id-type="medline">38039286</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-23-00175</pub-id>
          <pub-id pub-id-type="pmcid">PMC10691691</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Flores-Cohaila</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>García-Vicente</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Vizcarra-Jiménez</surname>
              <given-names>SF</given-names>
            </name>
            <name name-style="western">
              <surname>De la Cruz-Galán</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Gutiérrez-Arratia</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Quiroga Torres</surname>
              <given-names>BG</given-names>
            </name>
            <name name-style="western">
              <surname>Taype-Rondan</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on the Peruvian national licensing medical examination: cross-sectional study</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e48039</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e48039/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48039</pub-id>
          <pub-id pub-id-type="medline">37768724</pub-id>
          <pub-id pub-id-type="pii">v9i1e48039</pub-id>
          <pub-id pub-id-type="pmcid">PMC10570896</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Garabet</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mackey</surname>
              <given-names>BP</given-names>
            </name>
            <name name-style="western">
              <surname>Cross</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Weingarten</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT-4 performance on USMLE step 1 style questions and its implications for medical education: a comparative study across systems and disciplines</article-title>
          <source>Med Sci Educ</source>
          <year>2024</year>
          <volume>34</volume>
          <issue>1</issue>
          <fpage>145</fpage>
          <lpage>152</lpage>
          <pub-id pub-id-type="doi">10.1007/s40670-023-01956-z</pub-id>
          <pub-id pub-id-type="medline">38510401</pub-id>
          <pub-id pub-id-type="pii">1956</pub-id>
          <pub-id pub-id-type="pmcid">PMC10948644</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gilson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Safranek</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Socrates</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Chartash</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e45312</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e45312/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/45312</pub-id>
          <pub-id pub-id-type="medline">36753318</pub-id>
          <pub-id pub-id-type="pii">v9i1e45312</pub-id>
          <pub-id pub-id-type="pmcid">PMC9947764</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gobira</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nakayama</surname>
              <given-names>LF</given-names>
            </name>
            <name name-style="western">
              <surname>Moreira</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Andrade</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Regatieri</surname>
              <given-names>CVS</given-names>
            </name>
            <name name-style="western">
              <surname>Belfort</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT-4 in answering questions from the Brazilian national examination for medical degree revalidation</article-title>
          <source>Rev Assoc Med Bras (1992)</source>
          <year>2023</year>
          <volume>69</volume>
          <issue>10</issue>
          <fpage>e20230848</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.scielo.br/scielo.php?script=sci_arttext&#38;pid=S0104-42302023001000618&#38;lng=en&#38;nrm=iso&#38;tlng=en"/>
          </comment>
          <pub-id pub-id-type="doi">10.1590/1806-9282.20230848</pub-id>
          <pub-id pub-id-type="medline">37792871</pub-id>
          <pub-id pub-id-type="pii">S0104-42302023001000618</pub-id>
          <pub-id pub-id-type="pmcid">PMC10547492</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guillen-Grima</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Guillen-Aguinaga</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Guillen-Aguinaga</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Alas-Brun</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Onambele</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ortega</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Montejo</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Aguinaga-Ontoso</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Barach</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Aguinaga-Ontoso</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the efficacy of ChatGPT in navigating the Spanish medical residency entrance examination (MIR): promising horizons for AI in clinical medicine</article-title>
          <source>Clin Pract</source>
          <year>2023</year>
          <volume>13</volume>
          <issue>6</issue>
          <fpage>1460</fpage>
          <lpage>1487</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=clinpract13060130"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/clinpract13060130</pub-id>
          <pub-id pub-id-type="medline">37987431</pub-id>
          <pub-id pub-id-type="pii">clinpract13060130</pub-id>
          <pub-id pub-id-type="pmcid">PMC10660543</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Haze</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kawano</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Takase</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Suzuki</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hirawa</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tamura</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Influence on the accuracy in ChatGPT: differences in the amount of information per medical field</article-title>
          <source>Int J Med Inform</source>
          <year>2023</year>
          <volume>180</volume>
          <fpage>105283</fpage>
          <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105283</pub-id>
          <pub-id pub-id-type="medline">37931432</pub-id>
          <pub-id pub-id-type="pii">S1386-5056(23)00301-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>CH</given-names>
            </name>
            <name name-style="western">
              <surname>Hsiao</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Yeh</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>KC</given-names>
            </name>
            <name name-style="western">
              <surname>Kao</surname>
              <given-names>CH</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on stage 1 of the Taiwanese medical licensing exam</article-title>
          <source>Digit Health</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>20552076241233144</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/20552076241233144?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub++0pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/20552076241233144</pub-id>
          <pub-id pub-id-type="medline">38371244</pub-id>
          <pub-id pub-id-type="pii">10.1177_20552076241233144</pub-id>
          <pub-id pub-id-type="pmcid">PMC10874144</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yun</surname>
              <given-names>TR</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>CY</given-names>
            </name>
            <name name-style="western">
              <surname>Kwon</surname>
              <given-names>YK</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>CE</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 can pass the Korean national licensing examination for Korean medicine doctors</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <volume>2</volume>
          <issue>12</issue>
          <fpage>e0000416</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38100393"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000416</pub-id>
          <pub-id pub-id-type="medline">38100393</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-23-00147</pub-id>
          <pub-id pub-id-type="pmcid">PMC10723673</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jung</surname>
              <given-names>LB</given-names>
            </name>
            <name name-style="western">
              <surname>Gudera</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Wiegand</surname>
              <given-names>TLT</given-names>
            </name>
            <name name-style="western">
              <surname>Allmendinger</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dimitriadis</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Koerte</surname>
              <given-names>IK</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT passes German state examination in medicine with picture questions omitted</article-title>
          <source>Dtsch Arztebl Int</source>
          <year>2023</year>
          <volume>120</volume>
          <issue>21</issue>
          <fpage>373</fpage>
          <lpage>374</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37530052"/>
          </comment>
          <pub-id pub-id-type="doi">10.3238/arztebl.m2023.0113</pub-id>
          <pub-id pub-id-type="medline">37530052</pub-id>
          <pub-id pub-id-type="pii">arztebl.m2023.0113</pub-id>
          <pub-id pub-id-type="pmcid">PMC10413971</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kao</surname>
              <given-names>YS</given-names>
            </name>
            <name name-style="western">
              <surname>Chuang</surname>
              <given-names>WK</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Use of ChatGPT on Taiwan's examination for medical doctors</article-title>
          <source>Ann Biomed Eng</source>
          <year>2024</year>
          <volume>52</volume>
          <issue>3</issue>
          <fpage>455</fpage>
          <lpage>457</lpage>
          <pub-id pub-id-type="doi">10.1007/s10439-023-03308-9</pub-id>
          <pub-id pub-id-type="medline">37432530</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10439-023-03308-9</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kataoka</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yamamoto-Kataoka</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Furukawa</surname>
              <given-names>TA</given-names>
            </name>
          </person-group>
          <article-title>Beyond the pass mark: accuracy of ChatGPT and Bing in the national medical licensure examination in Japan</article-title>
          <source>JMA J</source>
          <year>2023</year>
          <volume>6</volume>
          <issue>4</issue>
          <fpage>536</fpage>
          <lpage>538</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37941716"/>
          </comment>
          <pub-id pub-id-type="doi">10.31662/jmaj.2023-0043</pub-id>
          <pub-id pub-id-type="medline">37941716</pub-id>
          <pub-id pub-id-type="pmcid">PMC10628311</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khorshidi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Mohammadi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yousem</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Abolghasemi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ansari</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Mirza-Aghazadeh-Attari</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Acharya</surname>
              <given-names>UR</given-names>
            </name>
            <name name-style="western">
              <surname>Abbasian Ardakani</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Application of ChatGPT in multilingual medical education: how does ChatGPT fare in 2023's Iranian residency entrance examination</article-title>
          <source>Informatics in Medicine Unlocked</source>
          <year>2023</year>
          <volume>41</volume>
          <fpage>101314</fpage>
          <pub-id pub-id-type="doi">10.1016/j.imu.2023.101314</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kleinig</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bacchi</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>This too shall pass: the performance of ChatGPT-3.5, ChatGPT-4 and new Bing in an Australian medical licensing examination</article-title>
          <source>Med J Aust</source>
          <year>2023</year>
          <volume>219</volume>
          <issue>5</issue>
          <fpage>237</fpage>
          <pub-id pub-id-type="doi">10.5694/mja2.52061</pub-id>
          <pub-id pub-id-type="medline">37528548</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kleinig</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Kovoor</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Bacchi</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Universal precautions required: artificial intelligence takes on the Australian Medical Council's trial examination</article-title>
          <source>Aust J Gen Pract</source>
          <year>2023</year>
          <volume>52</volume>
          <issue>12</issue>
          <fpage>863</fpage>
          <lpage>865</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.racgp.org.au/AJGP/2023/december/universal-precautions-required/"/>
          </comment>
          <pub-id pub-id-type="doi">10.31128/AJGP-02-23-6708</pub-id>
          <pub-id pub-id-type="medline">38049136</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Knoedler</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Alfertshofer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Knoedler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hoch</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Funk</surname>
              <given-names>PF</given-names>
            </name>
            <name name-style="western">
              <surname>Cotofana</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Maheta</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Frank</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Brébant</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Prantl</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lamby</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Pure wisdom or Potemkin villages? A comparison of ChatGPT 3.5 and ChatGPT 4 on USMLE step 3 style questions: quantitative analysis</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>e51148</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024//e51148/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/51148</pub-id>
          <pub-id pub-id-type="medline">38180782</pub-id>
          <pub-id pub-id-type="pii">v10i1e51148</pub-id>
          <pub-id pub-id-type="pmcid">PMC10799278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>UH</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>KS</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>TY</given-names>
            </name>
            <name name-style="western">
              <surname>Kan</surname>
              <given-names>JKC</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the performance of ChatGPT-4 on the United Kingdom medical licensing assessment</article-title>
          <source>Front Med (Lausanne)</source>
          <year>2023</year>
          <volume>10</volume>
          <fpage>1240915</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37795422"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fmed.2023.1240915</pub-id>
          <pub-id pub-id-type="medline">37795422</pub-id>
          <pub-id pub-id-type="pmcid">PMC10547055</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>SY</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>PK</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>WH</given-names>
            </name>
            <name name-style="western">
              <surname>Kao</surname>
              <given-names>CH</given-names>
            </name>
          </person-group>
          <article-title>Exploring the proficiency of ChatGPT-4: An evaluation of its performance in the Taiwan advanced medical licensing examination</article-title>
          <source>Digit Health</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>20552076241237678</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/20552076241237678?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub++0pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/20552076241237678</pub-id>
          <pub-id pub-id-type="medline">38449683</pub-id>
          <pub-id pub-id-type="pii">10.1177_20552076241237678</pub-id>
          <pub-id pub-id-type="pmcid">PMC10916498</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meyer</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Riese</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Streichert</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Comparison of the performance of GPT-3.5 and GPT-4 with that of medical students on the written German medical licensing examination: observational study</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>e50965</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024//e50965/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/50965</pub-id>
          <pub-id pub-id-type="medline">38329802</pub-id>
          <pub-id pub-id-type="pii">v10i1e50965</pub-id>
          <pub-id pub-id-type="pmcid">PMC10884900</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mihalache</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>RS</given-names>
            </name>
            <name name-style="western">
              <surname>Popovic</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Muni</surname>
              <given-names>RH</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT-4: an assessment of an upgraded artificial intelligence chatbot in the United States medical licensing examination</article-title>
          <source>Med Teach</source>
          <year>2024</year>
          <volume>46</volume>
          <issue>3</issue>
          <fpage>366</fpage>
          <lpage>372</lpage>
          <pub-id pub-id-type="doi">10.1080/0142159X.2023.2249588</pub-id>
          <pub-id pub-id-type="medline">37839017</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nakao</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Miki</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nakamura</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kikuchi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Nomura</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hanaoka</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yoshikawa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Abe</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Capability of GPT-4V(ision) in the Japanese national medical licensing examination: evaluation study</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>e54393</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024//e54393/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/54393</pub-id>
          <pub-id pub-id-type="medline">38470459</pub-id>
          <pub-id pub-id-type="pii">v10i1e54393</pub-id>
          <pub-id pub-id-type="pmcid">PMC10966435</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Oztermeli</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>Oztermeli</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT performance in the medical specialty exam: an observational study</article-title>
          <source>Medicine (Baltimore)</source>
          <year>2023</year>
          <volume>102</volume>
          <issue>32</issue>
          <fpage>e34673</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37565917"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/MD.0000000000034673</pub-id>
          <pub-id pub-id-type="medline">37565917</pub-id>
          <pub-id pub-id-type="pii">00005792-202308110-00076</pub-id>
          <pub-id pub-id-type="pmcid">PMC10419419</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roos</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kasapovic</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jansen</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kaczmarczyk</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in medical education: comparative analysis of ChatGPT, Bing, and medical students in Germany</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e46482</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e46482/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/46482</pub-id>
          <pub-id pub-id-type="medline">37665620</pub-id>
          <pub-id pub-id-type="pii">v9i1e46482</pub-id>
          <pub-id pub-id-type="pmcid">PMC10507517</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rosoł</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gąsior</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Łaba</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Korzeniewski</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Młyńczak</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of the performance of GPT-3.5 and GPT-4 on the Polish medical final examination</article-title>
          <source>Sci Rep</source>
          <year>2023</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>20512</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-023-46995-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-023-46995-z</pub-id>
          <pub-id pub-id-type="medline">37993519</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-023-46995-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC10665355</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Scaioli</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lo Moro</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Conrado</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Rosset</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bert</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Siliquini</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Exploring the potential of ChatGPT for clinical reasoning and decision-making: a cross-sectional study on the Italian medical residency exam</article-title>
          <source>Ann Ist Super Sanita</source>
          <year>2023</year>
          <volume>59</volume>
          <issue>4</issue>
          <fpage>267</fpage>
          <lpage>270</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.4415/ANN_23_04_05"/>
          </comment>
          <pub-id pub-id-type="doi">10.4415/ANN_23_04_05</pub-id>
          <pub-id pub-id-type="medline">38088393</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Xue</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Can ChatGPT pass China's national medical licensing examination?</article-title>
          <source>Asian J Surg</source>
          <year>2023</year>
          <volume>46</volume>
          <issue>12</issue>
          <fpage>6112</fpage>
          <lpage>6113</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1015-9584(23)01505-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.asjsur.2023.09.089</pub-id>
          <pub-id pub-id-type="medline">37775381</pub-id>
          <pub-id pub-id-type="pii">S1015-9584(23)01505-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Takagi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Watari</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Erabi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sakaguchi</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Performance of GPT-3.5 and GPT-4 on the Japanese medical licensing examination: comparison study</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e48002</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e48002/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48002</pub-id>
          <pub-id pub-id-type="medline">37384388</pub-id>
          <pub-id pub-id-type="pii">v9i1e48002</pub-id>
          <pub-id pub-id-type="pmcid">PMC10365615</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tong</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Guan</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in global health equity: an evaluation and discussion on the application of ChatGPT, in the Chinese national medical licensing examination</article-title>
          <source>Front Med (Lausanne)</source>
          <year>2023</year>
          <volume>10</volume>
          <fpage>1237432</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38020160"/>
          </comment>
          <pub-id pub-id-type="doi">10.3389/fmed.2023.1237432</pub-id>
          <pub-id pub-id-type="medline">38020160</pub-id>
          <pub-id pub-id-type="pmcid">PMC10656681</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Torres-Zegarra</surname>
              <given-names>BC</given-names>
            </name>
            <name name-style="western">
              <surname>Rios-Garcia</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ñaña-Cordova</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Arteaga-Cisneros</surname>
              <given-names>KF</given-names>
            </name>
            <name name-style="western">
              <surname>Chalco</surname>
              <given-names>XCB</given-names>
            </name>
            <name name-style="western">
              <surname>Ordoñez</surname>
              <given-names>MAB</given-names>
            </name>
            <name name-style="western">
              <surname>Rios</surname>
              <given-names>CJG</given-names>
            </name>
            <name name-style="western">
              <surname>Godoy</surname>
              <given-names>CAR</given-names>
            </name>
            <name name-style="western">
              <surname>Quezada</surname>
              <given-names>KLTP</given-names>
            </name>
            <name name-style="western">
              <surname>Gutierrez-Arratia</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Flores-Cohaila</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT, Bard, Claude, and Bing on the Peruvian national licensing medical examination: a cross-sectional study</article-title>
          <source>J Educ Eval Health Prof</source>
          <year>2023</year>
          <volume>20</volume>
          <fpage>30</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.doi.org/10.3352/jeehp.2023.20.30"/>
          </comment>
          <pub-id pub-id-type="doi">10.3352/jeehp.2023.20.30</pub-id>
          <pub-id pub-id-type="medline">37981579</pub-id>
          <pub-id pub-id-type="pii">jeehp.2023.20.30</pub-id>
          <pub-id pub-id-type="pmcid">PMC11009012</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Dou</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Performance and exploration of ChatGPT in medical examination, records and education in Chinese: pave the way for medical AI</article-title>
          <source>Int J Med Inform</source>
          <year>2023</year>
          <volume>177</volume>
          <fpage>105173</fpage>
          <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105173</pub-id>
          <pub-id pub-id-type="medline">37549499</pub-id>
          <pub-id pub-id-type="pii">S1386-5056(23)00191-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Gong</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT performs on the Chinese national medical licensing examination</article-title>
          <source>J Med Syst</source>
          <year>2023</year>
          <volume>47</volume>
          <issue>1</issue>
          <fpage>86</fpage>
          <pub-id pub-id-type="doi">10.1007/s10916-023-01961-0</pub-id>
          <pub-id pub-id-type="medline">37581690</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10916-023-01961-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Watari</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Takagi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sakaguchi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Nishizaki</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Yamamoto</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tokuda</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Performance comparison of ChatGPT-4 and Japanese medical residents in the general medicine in-training examination: comparison study</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e52202</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e52202/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/52202</pub-id>
          <pub-id pub-id-type="medline">38055323</pub-id>
          <pub-id pub-id-type="pii">v9i1e52202</pub-id>
          <pub-id pub-id-type="pmcid">PMC10733815</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>YM</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hwang</surname>
              <given-names>SJ</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT failed Taiwan's family medicine board exam</article-title>
          <source>J Chin Med Assoc</source>
          <year>2023</year>
          <volume>86</volume>
          <issue>8</issue>
          <fpage>762</fpage>
          <lpage>766</lpage>
          <pub-id pub-id-type="doi">10.1097/JCMA.0000000000000946</pub-id>
          <pub-id pub-id-type="medline">37294147</pub-id>
          <pub-id pub-id-type="pii">02118582-990000000-00224</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yanagita</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yokokawa</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Uchida</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tawara</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ikusaka</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Accuracy of ChatGPT on medical questions in the national medical licensing examination in Japan: evaluation study</article-title>
          <source>JMIR Form Res</source>
          <year>2023</year>
          <volume>7</volume>
          <fpage>e48023</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://formative.jmir.org/2023//e48023/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48023</pub-id>
          <pub-id pub-id-type="medline">37831496</pub-id>
          <pub-id pub-id-type="pii">v7i1e48023</pub-id>
          <pub-id pub-id-type="pmcid">PMC10612006</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yaneva</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Baldwin</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Jurich</surname>
              <given-names>DP</given-names>
            </name>
            <name name-style="western">
              <surname>Swygert</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Clauser</surname>
              <given-names>BE</given-names>
            </name>
          </person-group>
          <article-title>Examining ChatGPT performance on USMLE sample items and implications for assessment</article-title>
          <source>Acad Med</source>
          <year>2024</year>
          <volume>99</volume>
          <issue>2</issue>
          <fpage>192</fpage>
          <lpage>197</lpage>
          <pub-id pub-id-type="doi">10.1097/ACM.0000000000005549</pub-id>
          <pub-id pub-id-type="medline">37934828</pub-id>
          <pub-id pub-id-type="pii">00001888-202402000-00024</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ying</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT's potential role in non-English-speaking outpatient clinic settings</article-title>
          <source>Digit Health</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>20552076231184091</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/20552076231184091?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub++0pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/20552076231184091</pub-id>
          <pub-id pub-id-type="medline">37434733</pub-id>
          <pub-id pub-id-type="pii">10.1177_20552076231184091</pub-id>
          <pub-id pub-id-type="pmcid">PMC10331772</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zong</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on Chinese national medical licensing examinations: a five-year examination evaluation study for physicians, pharmacists and nurses</article-title>
          <source>BMC Med Educ</source>
          <year>2024</year>
          <volume>24</volume>
          <issue>1</issue>
          <fpage>143</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-024-05125-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-024-05125-7</pub-id>
          <pub-id pub-id-type="medline">38355517</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12909-024-05125-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC10868058</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rojas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rojas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Burgess</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Toro-Pérez</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Salehi</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Exploring the performance of ChatGPT versions 3.5, 4, and 4 with vision in the Chilean medical licensing examination: observational study</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>e55048</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024//e55048/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/55048</pub-id>
          <pub-id pub-id-type="medline">38686550</pub-id>
          <pub-id pub-id-type="pii">v10i1e55048</pub-id>
          <pub-id pub-id-type="pmcid">PMC11082432</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Keshtkar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hayat</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Atighi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Ayare</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Keshtkar</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yazdanpanahi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sadeghi</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Deilami</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Reihani</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Karimi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mokhtari</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Hashempur</surname>
              <given-names>MH</given-names>
            </name>
          </person-group>
          <source>ChatGPT's performance on Iran's medical licensing exams</source>
          <access-date>2024-06-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.21203/rs.3.rs-3253417/v1">https://doi.org/10.21203/rs.3.rs-3253417/v1</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Levin</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Horesh</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Brezinov</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Meyer</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT in medical examinations: a systematic review and a meta-analysis</article-title>
          <source>BJOG</source>
          <year>2024</year>
          <volume>131</volume>
          <issue>3</issue>
          <fpage>378</fpage>
          <lpage>380</lpage>
          <pub-id pub-id-type="doi">10.1111/1471-0528.17641</pub-id>
          <pub-id pub-id-type="medline">37604703</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sumbal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sumbal</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Amir</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Can ChatGPT-3.5 pass a medical exam? A systematic review of ChatGPT's performance in academic testing</article-title>
          <source>J Med Educ Curric Dev</source>
          <year>2024</year>
          <volume>11</volume>
          <fpage>23821205241238641</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/10.1177/23821205241238641?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub++0pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/23821205241238641</pub-id>
          <pub-id pub-id-type="medline">38487300</pub-id>
          <pub-id pub-id-type="pii">10.1177_23821205241238641</pub-id>
          <pub-id pub-id-type="pmcid">PMC10938614</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of ChatGPT-generated medical responses: a systematic review and meta-analysis</article-title>
          <source>J Biomed Inform</source>
          <year>2024</year>
          <volume>151</volume>
          <fpage>104620</fpage>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2024.104620</pub-id>
          <pub-id pub-id-type="medline">38462064</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(24)00038-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McInnes</surname>
              <given-names>MDF</given-names>
            </name>
            <name name-style="western">
              <surname>Moher</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Thombs</surname>
              <given-names>BD</given-names>
            </name>
            <name name-style="western">
              <surname>McGrath</surname>
              <given-names>TA</given-names>
            </name>
            <name name-style="western">
              <surname>Bossuyt</surname>
              <given-names>PM</given-names>
            </name>
            <collab>the PRISMA-DTA Group</collab>
            <name name-style="western">
              <surname>Clifford</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Deeks</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Gatsonis</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hooft</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hunt</surname>
              <given-names>HA</given-names>
            </name>
            <name name-style="western">
              <surname>Hyde</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Korevaar</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Leeflang</surname>
              <given-names>MMG</given-names>
            </name>
            <name name-style="western">
              <surname>Macaskill</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Reitsma</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Rodin</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rutjes</surname>
              <given-names>AWS</given-names>
            </name>
            <name name-style="western">
              <surname>Salameh</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Stevens</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Takwoingi</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tonelli</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Weeks</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Whiting</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Willis</surname>
              <given-names>BH</given-names>
            </name>
          </person-group>
          <article-title>Preferred reporting items for a systematic review and meta-analysis of diagnostic test accuracy studies: the PRISMA-DTA statement</article-title>
          <source>JAMA</source>
          <year>2018</year>
          <volume>319</volume>
          <issue>4</issue>
          <fpage>388</fpage>
          <lpage>396</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2017.19163</pub-id>
          <pub-id pub-id-type="medline">29362800</pub-id>
          <pub-id pub-id-type="pii">2670259</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ouzzani</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hammady</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Fedorowicz</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Elmagarmid</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Rayyan-a web and mobile app for systematic reviews</article-title>
          <source>Syst Rev</source>
          <year>2016</year>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>210</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://systematicreviewsjournal.biomedcentral.com/articles/10.1186/s13643-016-0384-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13643-016-0384-4</pub-id>
          <pub-id pub-id-type="medline">27919275</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13643-016-0384-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC5139140</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Whiting</surname>
              <given-names>PF</given-names>
            </name>
            <name name-style="western">
              <surname>Rutjes</surname>
              <given-names>AWS</given-names>
            </name>
            <name name-style="western">
              <surname>Westwood</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Mallett</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Deeks</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Reitsma</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Leeflang</surname>
              <given-names>MMG</given-names>
            </name>
            <name name-style="western">
              <surname>Sterne</surname>
              <given-names>JAC</given-names>
            </name>
            <name name-style="western">
              <surname>Bossuyt</surname>
              <given-names>PMM</given-names>
            </name>
            <collab>QUADAS-2 Group</collab>
          </person-group>
          <article-title>QUADAS-2: a revised tool for the quality assessment of diagnostic accuracy studies</article-title>
          <source>Ann Intern Med</source>
          <year>2011</year>
          <volume>155</volume>
          <issue>8</issue>
          <fpage>529</fpage>
          <lpage>536</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.acpjournals.org/doi/abs/10.7326/0003-4819-155-8-201110180-00009?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.7326/0003-4819-155-8-201110180-00009</pub-id>
          <pub-id pub-id-type="medline">22007046</pub-id>
          <pub-id pub-id-type="pii">155/8/529</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref64">
        <label>64</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Heath</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>All the news from OpenAI's first developer conference</article-title>
          <source>The Verge</source>
          <year>2023</year>
          <access-date>2024-04-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.theverge.com/2023/11/6/23948619/openai-chatgpt-devday-developer-conference-news">https://www.theverge.com/2023/11/6/23948619/openai-chatgpt-devday-developer-conference-news</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref65">
        <label>65</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>RSY</given-names>
            </name>
            <name name-style="western">
              <surname>Ming</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Raja Ali</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>The intersection of ChatGPT, clinical medicine, and medical education</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e47274</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e47274/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/47274</pub-id>
          <pub-id pub-id-type="medline">37988149</pub-id>
          <pub-id pub-id-type="pii">v9i1e47274</pub-id>
          <pub-id pub-id-type="pmcid">PMC10698645</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref66">
        <label>66</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Battineni</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Baldoni</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chintalapudi</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Sagaro</surname>
              <given-names>GG</given-names>
            </name>
            <name name-style="western">
              <surname>Pallotta</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Nittari</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Amenta</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Factors affecting the quality and reliability of online health information</article-title>
          <source>Digit Health</source>
          <year>2020</year>
          <volume>6</volume>
          <fpage>2055207620948996</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.sagepub.com/doi/abs/10.1177/2055207620948996?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1177/2055207620948996</pub-id>
          <pub-id pub-id-type="medline">32944269</pub-id>
          <pub-id pub-id-type="pii">10.1177_2055207620948996</pub-id>
          <pub-id pub-id-type="pmcid">PMC7466903</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref67">
        <label>67</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Bhasuran</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hanna</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Shavor</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Arguello</surname>
              <given-names>LG</given-names>
            </name>
            <name name-style="western">
              <surname>Murray</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Quality of answers of generative large language models versus peer users for interpreting laboratory test results for lay patients: evaluation study</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <volume>26</volume>
          <fpage>e56655</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024/1/e56655/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/56655</pub-id>
          <pub-id pub-id-type="medline">38630520</pub-id>
          <pub-id pub-id-type="pii">v26i1e56655</pub-id>
          <pub-id pub-id-type="pmcid">PMC11063893</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref68">
        <label>68</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Assessment of the capacity of ChatGPT as a self-learning tool in medical pharmacology: a study using MCQs</article-title>
          <source>BMC Med Educ</source>
          <year>2023</year>
          <volume>23</volume>
          <issue>1</issue>
          <fpage>864</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmededuc.biomedcentral.com/articles/10.1186/s12909-023-04832-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12909-023-04832-x</pub-id>
          <pub-id pub-id-type="medline">37957666</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12909-023-04832-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC10644619</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref69">
        <label>69</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Embracing ChatGPT for medical education: exploring its impact on doctors and medical students</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>e52483</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e52483/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/52483</pub-id>
          <pub-id pub-id-type="medline">38598263</pub-id>
          <pub-id pub-id-type="pii">v10i1e52483</pub-id>
          <pub-id pub-id-type="pmcid">PMC11043925</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref70">
        <label>70</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Jered McInerney</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Cole</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Malin</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Peleg</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wallace</surname>
              <given-names>BC</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Leveraging generative AI for clinical evidence synthesis needs to ensure trustworthiness</article-title>
          <source>J Biomed Inform</source>
          <year>2024</year>
          <volume>153</volume>
          <fpage>104640</fpage>
          <pub-id pub-id-type="doi">10.1016/j.jbi.2024.104640</pub-id>
          <pub-id pub-id-type="medline">38608915</pub-id>
          <pub-id pub-id-type="pii">S1532-0464(24)00058-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref71">
        <label>71</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Abd-Alrazaq</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>AlSaad</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Alhuwail</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmed</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Healy</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Latifi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Aziz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Damseh</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Alabed Alrazak</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sheikh</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Large language models in medical education: opportunities, challenges, and future directions</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e48291</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e48291/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48291</pub-id>
          <pub-id pub-id-type="medline">37261894</pub-id>
          <pub-id pub-id-type="pii">v9i1e48291</pub-id>
          <pub-id pub-id-type="pmcid">PMC10273039</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
