<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v27i1e65146</article-id>
      <article-id pub-id-type="pmid">39919278</article-id>
      <article-id pub-id-type="doi">10.2196/65146</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Unveiling GPT-4V's Hidden Challenges Behind High Accuracy on USMLE Questions: Observational Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Coristine</surname>
            <given-names>Andrew</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Kim</surname>
            <given-names>Su-Hwan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Thies</surname>
            <given-names>Bill</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Heston</surname>
            <given-names>Thomas F</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Yang</surname>
            <given-names>Zhichao</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2797-4257</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Yao</surname>
            <given-names>Zonghai</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5707-8410</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Tasmin</surname>
            <given-names>Mahbuba</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1884-8838</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Vashisht</surname>
            <given-names>Parth</given-names>
          </name>
          <degrees>MS</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0002-5556-7197</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Jang</surname>
            <given-names>Won Seok</given-names>
          </name>
          <degrees>RN, MS</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0001-5439-7299</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Ouyang</surname>
            <given-names>Feiyun</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7061-7351</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Beining</given-names>
          </name>
          <degrees>BS</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0006-5209-4848</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>McManus</surname>
            <given-names>David</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9343-6203</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Berlowitz</surname>
            <given-names>Dan</given-names>
          </name>
          <degrees>MD, MPH</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-8783-5611</ext-link>
        </contrib>
        <contrib id="contrib10" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Yu</surname>
            <given-names>Hong</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>Miner School of Computer &#38; Information Sciences</institution>
            <institution>University of Massachusetts Lowell</institution>
            <addr-line>1 University Ave</addr-line>
            <addr-line>Lowell, MA, 01854</addr-line>
            <country>United States</country>
            <phone>1 508 612 7292</phone>
            <email>Hong_Yu@uml.edu</email>
          </address>
          <xref rid="aff6" ref-type="aff">6</xref>
          <xref rid="aff7" ref-type="aff">7</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9263-5035</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>College of Information and Computer Science</institution>
        <institution>University of Massachusetts Amherst</institution>
        <addr-line>Amherst, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Miner School of Computer &#38; Information Sciences</institution>
        <institution>University of Massachusetts Lowell</institution>
        <addr-line>Lowell, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Shanghai Medical College</institution>
        <institution>Fudan University</institution>
        <addr-line>Shanghai</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Medicine</institution>
        <institution>University of Massachusetts Chan Medical School</institution>
        <addr-line>Worcester, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Public Health</institution>
        <institution>University of Massachusetts Lowell</institution>
        <addr-line>Lowell, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Center for Biomedical and Health Research in Data Sciences</institution>
        <institution>University of Massachusetts Lowell</institution>
        <addr-line>Lowell, MA</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff7">
        <label>7</label>
        <institution>Center for Healthcare Organization and Implementation Research</institution>
        <institution>VA Bedford Health Care System</institution>
        <addr-line>Bedford, MA</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Hong Yu <email>Hong_Yu@uml.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>7</day>
        <month>2</month>
        <year>2025</year>
      </pub-date>
      <volume>27</volume>
      <elocation-id>e65146</elocation-id>
      <history>
        <date date-type="received">
          <day>20</day>
          <month>8</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>16</day>
          <month>10</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>6</day>
          <month>11</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>26</day>
          <month>11</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Zhichao Yang, Zonghai Yao, Mahbuba Tasmin, Parth Vashisht, Won Seok Jang, Feiyun Ouyang, Beining Wang, David McManus, Dan Berlowitz, Hong Yu. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 07.02.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2025/1/e65146" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Recent advancements in artificial intelligence, such as GPT-3.5 Turbo (OpenAI) and GPT-4, have demonstrated significant potential by achieving good scores on text-only United States Medical Licensing Examination (USMLE) exams and effectively answering questions from physicians. However, the ability of these models to interpret medical images remains underexplored.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to comprehensively evaluate the performance, interpretability, and limitations of GPT-3.5 Turbo, GPT-4, and its successor, GPT-4 Vision (GPT-4V), specifically focusing on GPT-4V’s newly introduced image-understanding feature. By assessing the models on medical licensing examination questions that require image interpretation, we sought to highlight the strengths and weaknesses of GPT-4V in handling complex multimodal clinical information, thereby exposing hidden flaws and providing insights into its readiness for integration into clinical settings.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>This cross-sectional study tested GPT-4V, GPT-4, and ChatGPT-3.5 Turbo on a total of 227 multiple-choice questions with images from USMLE Step 1 (n=19), Step 2 clinical knowledge (n=14), Step 3 (n=18), the Diagnostic Radiology Qualifying Core Exam (DRQCE) (n=26), and AMBOSS question banks (n=150). AMBOSS provided expert-written hints and question difficulty levels. GPT-4V’s accuracy was compared with 2 state-of-the-art large language models, GPT-3.5 Turbo and GPT-4. The quality of the explanations was evaluated by choosing human preference between an explanation by GPT-4V (without hint), an explanation by an expert, or a tie, using 3 qualitative metrics: comprehensive explanation, question information, and image interpretation. To better understand GPT-4V’s explanation ability, we modified a patient case report to resemble a typical “curbside consultation” between physicians.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>For questions with images, GPT-4V achieved an accuracy of 84.2%, 85.7%, 88.9%, and 73.1% in Step 1, Step 2 clinical knowledge, Step 3 of USMLE, and DRQCE, respectively. It outperformed GPT-3.5 Turbo (42.1%, 50%, 50%, 19.2%) and GPT-4 (63.2%, 64.3%, 66.7%, 26.9%). When GPT-4V answered correctly, its explanations were nearly as good as those provided by domain experts from AMBOSS. However, incorrect answers often had poor explanation quality: 18.2% (10/55) contained inaccurate text, 45.5% (25/55) had inference errors, and 76.3% (42/55) demonstrated image misunderstandings. With human expert assistance, GPT-4V reduced errors by an average of 40% (22/55). GPT-4V accuracy improved with hints, maintaining stable performance across difficulty levels, while medical student performance declined as difficulty increased. In a simulated curbside consultation scenario, GPT-4V required multiple specific prompts to interpret complex case data accurately.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>GPT-4V achieved high accuracy on multiple-choice questions with images, highlighting its potential in medical assessments. However, significant shortcomings were observed in the quality of explanations when questions were answered incorrectly, particularly in the interpretation of images, which could not be efficiently resolved through expert interaction. These findings reveal hidden flaws in the image interpretation capabilities of GPT-4V, underscoring the need for more comprehensive evaluations beyond multiple-choice questions before integrating GPT-4V into clinical settings.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>artificial intelligence</kwd>
        <kwd>natural language processing</kwd>
        <kwd>large language model</kwd>
        <kwd>LLM</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>GPT</kwd>
        <kwd>GPT-4V</kwd>
        <kwd>USMLE</kwd>
        <kwd>Medical License Exam</kwd>
        <kwd>medical image interpretation</kwd>
        <kwd>United States Medical Licensing Examination</kwd>
        <kwd>NLP</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Using computers to help make clinical diagnoses and guide treatments has been a goal of artificial intelligence (AI) since its inception [<xref ref-type="bibr" rid="ref1">1</xref>]. The adoption of electronic health record systems by hospitals in the United States has resulted in an unprecedented amount of digital data associated with patient encounters. Computer-assisted clinical diagnostic support systems (CDSSs) endeavor to enhance clinicians’ decisions with patient information and clinical knowledge [<xref ref-type="bibr" rid="ref2">2</xref>]. There is burgeoning interest in CDSS for enhanced imaging [<xref ref-type="bibr" rid="ref3">3</xref>] in various disciplines such as breast cancer detection [<xref ref-type="bibr" rid="ref4">4</xref>], COVID detection [<xref ref-type="bibr" rid="ref5">5</xref>], diagnosing congenital cataracts [<xref ref-type="bibr" rid="ref6">6</xref>], and hidden fracture location [<xref ref-type="bibr" rid="ref7">7</xref>]. For a decision to be trustworthy for clinicians, CDSS should not only make the prediction but also provide accurate explanations [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. However, most previous imaging CDSSs only highlight areas deemed significant by AI [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref14">14</xref>], providing limited insight into the explanation of the diagnosis [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
      <p>Recent advances in large language models (LLMs) have encouraged much discussion in health care. State-of-the-art LLMs include GPT-3.5 Turbo, a chatbot released by OpenAI in October 2022, and its successor, GPT-4, released in March 2023. The success of GPT-3.5 Turbo and GPT-4 is attributed to their conversational ability and their performance, which have approached or matched human-level competence in cognitive tasks, spanning various domains including medicine [<xref ref-type="bibr" rid="ref16">16</xref>]. Both GPT-3.5 Turbo and GPT-4 have achieved commendable results in the United States Medical Licensing Examination (USMLE), leading to discussions about the readiness of LLM applications for integration into clinical [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>] and educational [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref22">22</xref>] environments.</p>
      <p>One limitation of GPT-3.5 Turbo and GPT-4 is that they can only read and generate text and are unable to process other data modalities, such as images. This limitation, known as “single modality,” is a common issue among many LLMs [<xref ref-type="bibr" rid="ref23">23</xref>]. Advancements in multimodal LLMs promise enhanced capabilities and integration with diverse data sources [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref26">26</xref>]. OpenAI’s GPT-4V is a state-of-the-art multimodal LLM equipped with image processing and understanding ability [<xref ref-type="bibr" rid="ref27">27</xref>]. However, the ability of GPT-4V to answer medical questions with images and provide explanations has not been comprehensively evaluated. In this study, we aimed to expose hidden flaws in GPT-4V’s ability to interpret clinical images by thoroughly evaluating its performance on medical licensing examination questions involving image interpretation. For GPT-4V to be useful to medical professionals, it should not only provide correct responses but also offer accurate explanations for its reasoning, especially in complex multimodal clinical scenarios [<xref ref-type="bibr" rid="ref28">28</xref>].</p>
    </sec>
    <sec sec-type="method">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>This cross-sectional study aimed to expose the hidden flaws of GPT-4V in clinical image interpretation by comparing the performance between GPT-4V, GPT-4, and GPT-3.5 Turbo in answering medical licensing examination questions. This study also investigates the quality of GPT-4V explanation in answering these questions. The overview of the study is shown in <xref rid="figure1" ref-type="fig">Figure 1</xref>. This study was conducted in October 2023.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>A summary of the image question selection process and prompt to large language models.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e65146_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>The requirement for ethical approval and informed consent was waived by the institutional review board at the VA Bedford Health Care System because no patient data were used. The experiments were performed in accordance with the Declaration of Helsinki.</p>
      </sec>
      <sec>
        <title>Medical Exams and a Patient Case Report Collection</title>
        <p>We obtained study questions from 3 sources. USMLE consists of 3 steps required to obtain a medical license in the United States. USMLE assesses a physician’s ability to apply knowledge, concepts, and principles, which is critical to both health and disease management and is the foundation for safe, efficient patient care. Step 1 assesses foundational scientific concepts essential for medical practice, Step 2 clinical knowledge (CK) evaluates the application of clinical science for supervised patient care, and Step 3 tests the medical knowledge required for unsupervised practice. Step 1, Step 2 CK, and Step 3 of the USMLE sample exam released from the National Board of Medical Examiners consist of 119, 120, and 137 questions respectively. We accessed these questions from publicly available links [<xref ref-type="bibr" rid="ref29">29</xref>]. Each question contained multiple options to choose from. We then selected all questions with images, resulting in 19, 14, and 18 questions from Step 1, Step 2 CK, and Step 3. Medical subdomains include but are not limited to radiology, dermatology, orthopedics, ophthalmology, cardiology, and general surgery.</p>
        <p>The sample exam only included limited questions with images. Thus, we further collected similar questions from AMBOSS, a widely used question bank for medical students, which provides students’ performance on the exam. The performance enabled us to assess the comparative effectiveness of the model. For each question, AMBOSS associated an expert-written hint to tip the student to answer the question and a difficulty level that ranges from 1-5. Levels 1, 2, 3, 4, and 5 represent the easiest 20%, 20%-50%, 50%-80%, 80%-95%, and 95%-100% of questions respectively [<xref ref-type="bibr" rid="ref30">30</xref>]. Hints are designed to guide students to the correct answer. They are typically formatted as a short paragraph that describes the image. We manually checked that no hint had disclosed the answer directly. In addition to the gold standard choice, each answer is associated with a detailed explanation by AMBOSS. They were developed through an internal peer-review process involving more than 50 physicians who achieved high scores in the exam. We used a commercial license to access the questions. Since AMBOSS is not publicly available and its licensing terms restrict the automatic website scraping of its proprietary content, they are not in the CommonCrawl data set used to train GPTs [<xref ref-type="bibr" rid="ref31">31</xref>]. We randomly selected and manually downloaded 10 questions from each of the 5 difficulty levels. We repeated this process for Step 1, Step 2 CK, and Step 3. This resulted in a total number of 150 questions.</p>
        <p>In addition, we collected questions from the Diagnostic Radiology Qualifying Core Exam (DRQCE) [<xref ref-type="bibr" rid="ref32">32</xref>], which is an image-rich exam to evaluate a candidate’s foundational knowledge and clinical judgment across practice domains of diagnostic radiology, which is offered after 36 months of residency training. Since DRQCE is proprietary, we used a commercial license to access the 26 questions with images out of the 54 questions in the preparation exam offered by the American Board of Radiology. In total, we had 227 questions with images from the 3 aforementioned sources.</p>
        <p>To illustrate GPT-4V’s potential as an imaging diagnostic support tool and further expose its limitations, we used part of a patient case report [<xref ref-type="bibr" rid="ref33">33</xref>] to resemble a typical “curbside consultation” between medical professionals [<xref ref-type="bibr" rid="ref34">34</xref>]. In this case, the patient’s admission info, such as history of present illness, labs, and images of the case report will be presented to both a physician and GPT-4V. The physician can then work with GPT-4V through question answering, for example, by asking GPT-4V to help interpret images, for the final clinical diagnosis.</p>
      </sec>
      <sec>
        <title>How to Answer Image Questions Using GPT-4V Prompts</title>
        <p>GPT-4V took image and text data as inputs to generate textual outputs. Given that input format (prompt) played a key role in optimizing model performance, we followed the standard prompting guidelines of the visual question-answering task [<xref ref-type="bibr" rid="ref35">35</xref>]. Specifically, we prompted GPT-4V by first adding the image, then appending context (ie, patient information) and questions, and finally providing multiple-choice options, each separated by a new line. An example user prompt and GPT-4V response are shown in Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. When multiple subimages existed in the image, we uploaded multiple subimages to GPT-4V. We did not append a hint to the end of the question, unless otherwise specified. The response consists of the selected option as an answer, supported by a textual explanation to substantiate the selected decision. When using GPT-3.5 Turbo and GPT-4 models that cannot handle image data, images were omitted from the prompt. These models were accessed through OpenAI application programming interfaces. Responses were collected from the September 25, 2023, version of models.</p>
      </sec>
      <sec>
        <title>Evaluation Metrics</title>
        <p>For answer accuracy, we evaluated the model’s performance by comparing the model’s choice with the correct choice provided by the exam board or question bank website. We defined accuracy as the ratio of the number of correct choices to the total number of questions.</p>
        <p>We also evaluated the quality of the explanation by preference from 3 health care professionals (1 medical doctor with 35 years of experience in internal medicine, 1 registered ward nurse with 2 years of experience, and 1 third-year medical school student). For each question from the AMBOSS data set (n=150), we first asked the health care professionals to choose their preference between an explanation by GPT-4V (without hint), an explanation by an expert, or a tie without knowing the correctness of GPT-4V’s answers. The exclusion of correctness is to avoid bias in their preference of explanations. In addition, the source of the explanations was blinded to the health care professionals, ensuring that their judgments were not influenced by knowing whether an explanation came from GPT-4V or an expert.</p>
        <p>In addition, we also asked health care professionals to evaluate the GPT-4V explanation from a sufficient and comprehensive perspective [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. They determined if the information exists in the explanation, that consists of (1) image interpretation: GPT-4V tried to interpret the image in the explanation, and such interpretation is sufficient to support its choice; (2) question information: explanations contained information related to the textual context (ie, patient information) of the question, and such information was essential for GPT-4V’s choice; (3) comprehensive explanation: the explanation included comprehensive reasoning for all possible evidence (eg, symptoms, lab results) that leads to the final answer.</p>
        <p>Finally, for each question answered incorrectly, we asked health care professionals to check if the explanation contained any errors that consisted of (1) image misunderstanding (if the sentence in the explanation showed an incorrect interpretation of the image; eg, GPT-4V said that a bone in the image was for the hand, but it was in fact the foot); (2) text hallucination (if the sentence in the explanation contained made-up information [<xref ref-type="bibr" rid="ref38">38</xref>]; eg, claiming Saxenda was insulin); (3) reasoning error (if the sentence did not properly infer knowledge in either image or text to an answer; eg, GPT-4V reasoned that a patient took a trip within the last 3 months and therefore diagnosed the patient as having Chagas disease, despite the clinical knowledge that Chagas disease usually develops 10-20 years after infection); or (4) nonmedical error (GPT is known to struggle with tasks requiring precise spatial localization, such as identifying chess positions on the board [<xref ref-type="bibr" rid="ref27">27</xref>]).</p>
        <p>In this study, we asked an internal medicine doctor with 35 years of experience to articulate a detailed rating guideline above. Our study has shown that the medical student and nurse, both of whom participated independently, agreed with the doctor’s ratings 95% and 86% of the time, respectively. This high agreement ratio underscores the effectiveness of the standardized guidelines in ensuring consistent evaluation across varying levels of expertise.</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>Chi-square tests and pairwise comparisons with Bonferroni corrections were used for the performance metrics of GPT-3.5 Turbo, GPT-4, and GPT-4V on visual question answering exams. GPT-4V’s accuracies on the AMBOSS data set were compared between different difficulties using unpaired chi-square tests with a significance level of 0.05. All analysis was conducted in Python software (version 3.10.11; Python Software Foundation).</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Overall Answer Accuracy</title>
        <p>For all questions in the USMLE sample exam (including ones without images), GPT-4V achieved an accuracy of 88.2%, 90.8%, and 92.7% among Step 1, Step 2 CK, and Step 3 of USMLE questions, respectively. In comparison, GPT-3.5 Turbo and GPT-4 achieved an accuracy of 55.1% and 81.5% in Step 1, 59.1% and 80.8% in Step 2 CK, and 60.9% and 88.3% in Step 3, respectively (<xref ref-type="table" rid="table1">Table 1</xref>). GPT-4V outperformed GPT-4 and GPT-3.5 Turbo by 11.3% (95% CI 11.1%-11.5%; <italic>P</italic>&#60;.001) and 32% (95% CI 31.7%-32.3%; <italic>P</italic>&#60;.001). The score of GPT-4V passes the standard for the USMLE (about 60%). The performance of GPT-4V across different subdomains is shown in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Performance of GPT-3.5 Turbo, GPT-4, and GPT-4V on a USMLE sample exam from the National Board of Medical Examiners without hints.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="460"/>
            <col width="290"/>
            <col width="220"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Exam name and agents</td>
                <td colspan="2">Performance</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>Questions with image, n (%)</td>
                <td>All questions, n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="4">
                  <bold>USMLE<sup>a</sup> sample exam-Step 1<sup>b</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Sample size</td>
                <td>19</td>
                <td>119</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-3.5 Turbo</td>
                <td>8 (42.1)</td>
                <td>66 (55.1)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td>12 (63.2)</td>
                <td>97 (81.5)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4V</td>
                <td>16 (84.2)</td>
                <td>105 (88.2)</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>USMLE sample exam-Step 2 clinical knowledge<sup>c</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Sample size</td>
                <td>14</td>
                <td>120</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-3.5 Turbo</td>
                <td>7 (50)</td>
                <td>71 (59.1)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td>9 (64.3)</td>
                <td>97 (80.8)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4V</td>
                <td>12 (85.7)</td>
                <td>109 (90.8)</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>USMLE sample exam-Step 3<sup>d</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Sample size</td>
                <td>18</td>
                <td>137</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-3.5 Turbo</td>
                <td>9 (50)</td>
                <td>73 (60.9)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td>12 (66.7)</td>
                <td>121 (88.3)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4V</td>
                <td>16 (88.9)</td>
                <td>127 (92.7)</td>
              </tr>
              <tr valign="top">
                <td colspan="4">
                  <bold>DRQCE<sup>e</sup> sample exam<sup>f</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Sample size</td>
                <td>26</td>
                <td>54</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-3.5 Turbo</td>
                <td>5 (19.2)</td>
                <td>31 (57.4)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4</td>
                <td>7 (26.9)</td>
                <td>35 (64.8)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>GPT-4V</td>
                <td>19 (73.1)</td>
                <td>48 (88.9)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>USMLE: United States Medical Licensing Examination.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>19 questions with images and 119 questions in total in Step 1.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>14 questions with images and 120 questions in total in Step 2 CK.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>There were 18 questions with images and 137 questions in total in Step 3.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>DRQCE: Diagnostic Radiology Qualifying Core Exam.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>There were 26 questions with images and 54 questions in total in DRQCE.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>For questions with images, GPT-4V achieved an accuracy of 84.2%, 85.7%, and 88.9% in Step 1, Step 2 CK, and Step 3 of USMLE questions, respectively. It outperformed GPT-3.5 Turbo and GPT-4 by 42.1% (8/19; 95% CI 36.8%-47.4%; <italic>P</italic>&#60;.001) and 21.1% (4/19; 95% CI 7.8%-34.2%; <italic>P</italic>=.01) in Step 1, 35.7% (5/14; 95% CI 3.1%-39.7%; <italic>P</italic>=.03) and 21.4% (3/14; 95% CI 4.7%-38.1%; <italic>P</italic>=.02) in Step 2 CK, 38.9% (7/18; 95% CI 32.2%-45.7%; <italic>P</italic>&#60;.001) and 22.2% (4/18; 95% CI 5.5%-38.9%; <italic>P</italic>=.02) in Step 3, respectively. Similarly, GPT-4V achieved an accuracy of 73.1%, outperforming GPT-3.5 Turbo by 53.9% (14/26; 95% CI 41.6%-66.2%; <italic>P</italic>&#60;.001) and GPT-4 by 46.2% (12/26; 95% CI 29.8%-62.5%; <italic>P</italic>&#60;.001) in DRQCE (<xref ref-type="table" rid="table1">Table 1</xref>). This highlights the superior ability of GPT-4V to interpret clinical images compared with earlier versions.</p>
      </sec>
      <sec>
        <title>Impact of Difficulty Level and Use of Hints</title>
        <p>When asking GPT-4V questions without a hint, it achieved an accuracy of 60%, 64%, and 66% for AMBOSS Step 1, Step 2 CK, and Step 3, respectively (<xref ref-type="table" rid="table2">Table 2</xref>). GPT-4V was in the 72nd, 76th, and 80th percentile with AMBOSS users who were preparing for Step 1, Step 2 CK, and Step 3, respectively. When asking GPT-4V questions with a hint, it achieved an accuracy of 84%, 86%, and 88% for AMBOSS Step 1, Step 2 CK, and Step 3, respectively. Figure S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> is an example where GPT-4V switched the answer from incorrect to correct when a hint was provided. GPT-4V predictions on the entire AMBOSS data set with images are reported in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> (n=646). Conclusions drawn from automatic evaluation align with our findings presented in <xref ref-type="table" rid="table2">Table 2</xref> (n=150).</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Performance of GPT-4V on AMBOSS.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="260"/>
            <col width="0"/>
            <col width="170"/>
            <col width="0"/>
            <col width="140"/>
            <col width="0"/>
            <col width="110"/>
            <col width="0"/>
            <col width="100"/>
            <col width="0"/>
            <col width="100"/>
            <col width="0"/>
            <col width="90"/>
            <thead>
              <tr valign="top">
                <td colspan="3">AMBOSS steps and hint availability</td>
                <td colspan="11">GPT-4V accuracy on AMBOSS, %</td>
              </tr>
              <tr valign="top">
                <td colspan="3">
                  <break/>
                </td>
                <td colspan="2">Overall (n=50)</td>
                <td colspan="2">1 (n=10)</td>
                <td colspan="2">2 (n=10)</td>
                <td colspan="2">3 (n=10)</td>
                <td colspan="2">4 (n=10)</td>
                <td>5 (n=10)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="14">
                  <bold>Step 1</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Without hint</td>
                <td colspan="2">60</td>
                <td colspan="2">70</td>
                <td colspan="2">70</td>
                <td colspan="2">30</td>
                <td colspan="2">70</td>
                <td colspan="2">60</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Expert hint</td>
                <td colspan="2">84</td>
                <td colspan="2">80</td>
                <td colspan="2">80</td>
                <td colspan="2">80</td>
                <td colspan="2">90</td>
                <td colspan="2">90</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>Step 2 clinical knowledge</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Without hint</td>
                <td colspan="2">64</td>
                <td colspan="2">80</td>
                <td colspan="2">70</td>
                <td colspan="2">70</td>
                <td colspan="2">50</td>
                <td colspan="2">50</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Expert hint</td>
                <td colspan="2">86</td>
                <td colspan="2">100</td>
                <td colspan="2">90</td>
                <td colspan="2">100</td>
                <td colspan="2">70</td>
                <td colspan="2">70</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>Step 3</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Without hint</td>
                <td colspan="2">66</td>
                <td colspan="2">80</td>
                <td colspan="2">90</td>
                <td colspan="2">60</td>
                <td colspan="2">50</td>
                <td colspan="2">50</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Expert hint</td>
                <td colspan="2">88</td>
                <td colspan="2">90</td>
                <td colspan="2">90</td>
                <td colspan="2">90</td>
                <td colspan="2">90</td>
                <td colspan="2">80</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> shows a decreasing trend in GPT-4V’s performance in the AMBOSS data set when the difficulty of questions increased (<italic>P</italic>=.04) without a hint. However, with the hint, the performance of GPT-4V plateaued across 5 difficulty levels. Importantly, the accuracies of both GPT-4V, with or without a hint, in general, outperformed the accuracies of medical students, and the gap between the performance of GPT-4V and medical students increased when the difficulty increased. On the most difficult questions, GPT-4V with hint outperformed medical students by 60% (18/30, 95% CI 56.8%-63.1%; <italic>P</italic>&#60;.001), and GPT-4V without hint outperformed medical students by 26.7% (8/30, 95% CI 24.2%-29.3%; <italic>P</italic>&#60;.001). The findings show that while GPT-4V outperforms medical students in accuracy, its performance is largely dependent on context-based hints, reflecting a fundamental flaw in image reasoning.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Performance of GPT-4V and students on 150 AMBOSS questions with different difficulty levels.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e65146_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Quality of Explanation</title>
        <p>We evaluated the user’s preference among GPT-4V-generated explanations and expert-generated explanations. When GPT-4V answered incorrectly, our results show that health care professionals overwhelmingly preferred expert explanations as shown in <xref ref-type="table" rid="table3">Table 3</xref>. In total, 47 preferred experts and 0 preferred GPT-4V. When GPT-4V answered correctly, the quality of GPT-4V-generated explanations was close to expert-generated explanations: out of 95 votes, 19 preferred experts, 15 preferred GPT-4V, and 61 preferred either. The preference for expert explanations in incorrect answers highlights key weaknesses in GPT-4V’s ability to interpret clinical images accurately and offer dependable reasoning.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Health care professionals preferred explanations for 150 AMBOSS questions.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="350"/>
            <col width="250"/>
            <col width="200"/>
            <col width="170"/>
            <thead>
              <tr valign="top">
                <td colspan="2">AMBOSS steps and correctness of GPT-4V (without hint) responses</td>
                <td colspan="3">Health care professionals’ preference</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>Prefer expert</td>
                <td>Ties</td>
                <td>Prefer GPT-4V</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="5">
                  <bold>Step 1</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Correct</td>
                <td>4</td>
                <td>23</td>
                <td>3</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Incorrect</td>
                <td>16</td>
                <td>4</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td colspan="5">
                  <bold>Step 2 clinical knowledge</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Correct</td>
                <td>10</td>
                <td>15</td>
                <td>7</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Incorrect</td>
                <td>18</td>
                <td>0</td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td colspan="5">
                  <bold>Step 3</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Correct</td>
                <td>5</td>
                <td>23</td>
                <td>5</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Incorrect</td>
                <td>13</td>
                <td>4</td>
                <td>0</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>We further evaluated the quality of the GPT-4V generated explanation by verifying whether the explanation includes image and question text interpretation in Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. When examining the 95 correct answers, 84.2% (n=80) of the responses contained an interpretation of the image, while 96.8% (n=92) aptly captured the information presented in the question. On the other hand, for the 55 incorrect answers, 92.7% (n=51) interpreted the image, and 89.1% (n=49) depicted the question’s details. In terms of comprehensiveness, GPT-4V offered a comprehensive explanation in 79% (n=75) of correct responses. In contrast, only 7.2% (n=4) of the wrong responses had a comprehensive explanation that led to the GPT-4V’s choice.</p>
        <p>We also evaluated the explanations of GPT-4V’s incorrect responses and grouped them into 4 categories: image misunderstanding, text hallucination, reasoning error, and nonmedical error. Among GPT-4V responses with wrong answers (n=55), we found that 76.3% (n=42) of responses included a misunderstanding of the image, 45.5% (n=25) of responses included a reasoning error, 18.2% (n=10) of responses included text hallucination, and no responses included nonmedical errors.</p>
      </sec>
      <sec>
        <title>A Case Study of Curbside Consultation</title>
        <p>We present a clinical case study involving a 45-year-old woman with hypertension and altered mental status. As shown in Figure S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, a collaborative design of GPT-4V allows communication between GPT-4V and physicians. In this scenario, when asked to interpret a CT scan, GPT-4V initially provided an irrelevant answer. GPT-4V needed 5 additional physician-guided prompts to list potential diagnoses, including primary aldosteronism, hypertension, and Cushing’s syndrome. For instance, when the physician specifically prompted, “If I suspect Cushing’s syndrome due to ectopic ACTH secretion, what would be the next steps to evaluate this patient to determine the source of the hormonal abnormality?” and pointed to a specific area on the CT scan, GPT-4V was then able to respond correctly. This interaction indicates that GPT-4V struggles to autonomously interpret medical images, requiring continuous and specific prompts for accurate interpretation, which underscores its flaws in independent image reasoning.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Recent advancements in medical question-answering systems have leveraged domain-specific transformer models. Early models such as PubMedBERT [<xref ref-type="bibr" rid="ref39">39</xref>] with 100 million parameters score around 38.3% in USMLE. The introduction of larger models marked a substantial improvement. JMLR [<xref ref-type="bibr" rid="ref40">40</xref>] with 13 billion parameters, Med-Palm [<xref ref-type="bibr" rid="ref41">41</xref>] with 540 billion parameters, and GPT-4 achieves 62.5%, 86.2%, and 90.2% respectively. However, previous works only tested these models on text-only questions without images [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref42">42</xref>-<xref ref-type="bibr" rid="ref44">44</xref>] or questions in non-English languages [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. Unlike previous works that focus primarily on accuracy [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>], we emphasize explanation quality as a crucial metric for assessing the model’s clinical applicability. In particular, we evaluated GPT-4V’s ability to interpret medical images (a new feature) to highlight hidden flaws in clinical image interpretation.</p>
        <p>We found that GPT-4V outperformed both GPT-3.5 Turbo and GPT-4 (<xref ref-type="table" rid="table1">Table 1</xref>). When evaluating all questions in the USMLE sample exam, GPT-4V achieved an accuracy of 90.7% outperforming GPT-3.5 Turbo (58.5%) and GPT-4 (83.8%). In comparison, medical students can pass the USMLE exam with more than 60% accuracy, indicating that the GPT-4V performed at a level similar to or above a medical student in the final year of study. The accuracy of GPT-4V highlights its grasp over biomedical and clinical sciences, essential for medical practice, and showcases its ability in patient management and problem-solving skills [<xref ref-type="bibr" rid="ref49">49</xref>]. Other studies further demonstrated the potential for clinical routines, such as summarizing radiology reports [<xref ref-type="bibr" rid="ref50">50</xref>] and differential diagnosis [<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref52">52</xref>].</p>
        <p>For medical exam questions with images, we found that GPT-4V achieved an accuracy of 62%, which was equivalent to the 70th-80th percentile with AMBOSS medical students. This finding indicates that GPT-4V has the capability to integrate information from both text and images to answer questions, making it a promising tool for answering clinical questions based on images. However, our evaluation also reveals hidden flaws in its image interpretation, particularly in its inconsistency and the need for extensive context to provide accurate answers.</p>
        <p>Another important finding is that GPT-4V significantly outperformed medical students for questions considered difficult for the students. Specifically, our results, as shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>, show that while medical students’ performance linearly decreased when the difficulty of questions increased, GPT-4V’s performance stayed relatively stable. When expert hints were provided, GPT-4V’s performance plateaued across questions at all difficulty levels. This consistent performance indicates that GPT-4V effectively addresses questions that medical students find challenging. Its advanced capabilities suggest potential as an educational assistant, particularly for complex topics. With the guidance of teachers’ hints, medical students could benefit from its advanced capabilities to understand and analyze complex medical questions.</p>
        <p>There may be multiple factors that contribute to GPT-4V’s performance on difficult questions. Instrument methods (eg, item response theory [<xref ref-type="bibr" rid="ref53">53</xref>]) have been typically used for the construction and evaluation of measurement scales and tests. For example, item response theory uses a statistical model that links an individual person’s responses to individual test items (questions on a test) to the person’s ability to correctly respond to the items and the items’ features. Therefore, medical examination test sets have been specifically selected and tailored to medical students’ performance with the intended distribution where the performance decreases when the difficulty level increases. Although more evaluation is needed to draw the conclusion that GPT-4V substantially outperformed medical students in difficult questions, our results at least show that GPT-4V performed differently.</p>
        <p>On the other hand, we found that GPT-4V’s performance was inconsistent among different medical subdomains. As shown in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, GPT-4V achieved high accuracy on subdomains such as immunology (5/5, 100%), otolaryngology (6/6, 100%), and pulmonology (6/8, 75%), and low accuracy on others such as anatomy (1/4, 25%), emergency medicine (1/4, 25%), and pathology (5/10, 50%). This suggests that while GPT-4V shows potential in some specialties or subdomains, it may require further development to be reliable across the board. The uneven performance highlights the need for tailored approaches to enhancing the model’s capabilities where it falls short.</p>
        <p>Another advantage of GPT-4V is its ability to explain its image content. Previous studies have shown limited use of current CDSS as most of them offered limited decision explanations and thus gained limited trust among physicians (unlike their colleagues) [<xref ref-type="bibr" rid="ref54">54</xref>-<xref ref-type="bibr" rid="ref57">57</xref>]. In contrast, GPT-4V has the potential to improve the effectiveness and credibility of CDSS by providing explanations preferred by experts. As our results indicate, the quality of explanations generated by GPT-4V, when answering correctly, is close to that of expert-generated explanations. Although in more complex scenarios (such as in our curbside consult setting), GPT-4V currently requires continuous highly specialized guidance, which temporarily prevents it from enhancing physician work efficiency, this feature still has the potential to encourage physicians to adopt and use GPT-4V more confidently and broadly.</p>
        <p>In terms of explanation quality, we found that more than 80% of responses from GPT-4V provided an interpretation of the image, regardless of whether the responses were correct or not. This suggests that GPT-4V consistently takes into account the image while generating responses. Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> illustrates an example of a high-quality explanation that uses images to answer a hard question. In this example, more than 70% of students answered incorrectly on the first try, because both bacterial pneumonia and pulmonary embolism may involve symptoms such as cough. To differentiate them, GPT-4V correctly interpreted the x-ray with a radiologic sign of Hampton hump, which further increased the suspicion of pulmonary infarction rather than pneumonia [<xref ref-type="bibr" rid="ref58">58</xref>]. To show the need for an x-ray as mentioned in the explanation, we removed the image from the input, and GPT-4V switched the answer to bacterial pneumonia while also acknowledging the possibility of pulmonary infarction. This change in response demonstrated the high quality of the GPT-4V explanation, as its explanation about x-rays was not fictional and it truly needed the x-ray to answer this question.</p>
        <p>On the other hand, we found that the quality of generated explanations was poor when GPT-4V answered incorrectly. Manual analyses by health care professionals concluded that image misunderstanding was the primary reason why GPT-4V answered incorrectly. Out of 55 wrong responses, 42 (76.3%) were due to misunderstanding of the image. In comparison, only 18.2% (10/55) of the mistakes were attributed to text misinterpretation. Clearly, GPT-4V’s proficiency in processing images was considerably lagging behind its text-handling capability. This gap in capability suggests that GPT-4V’s advancements in image understanding remain nascent and require significant refinement to align with its text analysis capabilities. To circumvent its image interpretation issue, we additionally prompted GPT-4V with a short hint that described the image. We found that 40% (22/55) of responses switched to the correct answer. One potential future direction involves strengthening GPT-4V's domain-specific knowledge by integrating extensive clinical datasets into its training. For example, employing domain-adaptive pretraining methods—such as those used in MEDITRON [<xref ref-type="bibr" rid="ref59">59</xref>], which leverages medical guidelines and specialized clinical corpora—could significantly improve the model’s understanding of medical concepts, leading to more precise and contextually relevant explanations. In addition, incorporating retrieval-augmented generation based on domain-specific corpora [<xref ref-type="bibr" rid="ref40">40</xref>] would enable the model to access and retrieve pertinent clinical information during inference, grounding its explanations in verified data. This could improve factual accuracy and reduce the likelihood of incorrect or unsupported responses. Together, these strategies aim to bolster the model's capacity to provide high-quality, accurate explanations, thereby enhancing its overall reliability and usefulness in clinical applications.</p>
        <p>Creating these image-related hints requires clinical expertise, limiting the use of GPT-4V as a CDSS. In our case study, when GPT-4V delivered an irrelevant response, the physician needed to come up with correct hints for GPT-4V. These findings reveal a key limitation: GPT-4V’s reliance on external guidance from experts to interpret complex image content effectively, thereby exposing its inability to operate independently in clinical scenarios. Efforts improving GPT-4V on images include multimodal LLMs with reinforcement learning from human feedback to align the outputs of LLMs with physicians’ intentions and expectations. This alignment is critical not only for enhancing the accuracy and relevance of the responses but also for integrating GPT-4V seamlessly into clinical environments where time is of the essence [<xref ref-type="bibr" rid="ref60">60</xref>].</p>
        <p>Another significant drawback of GPT-4V involved its tendency to produce factually inaccurate responses, a problem often referred to as the hallucination effect, which is prevalent among many LLMs such as GPT-4V [<xref ref-type="bibr" rid="ref38">38</xref>]. We found that more than 18% of GPT-4V explanations contain hallucinations, potentially misleading or distracting physicians, particularly the less experienced medical students and residents. This finding emphasizes the need for robust evaluation and correction mechanisms to minimize hallucinations, which are critical to ensure GPT-4V’s reliability and safety in clinical practice. One future direction is to integrate GPT-4V and a probabilistic model with CI and citations from credible sources to show the reliability of the response [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref62">62</xref>]. The confidence score could also help prioritize the list of differential diagnoses, making it clearer to the physician which conditions are more probable. Thereby reducing the risk of confusion and enhancing the reliability of the CDSS response when additional physician review is warranted [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study has several limitations. First, our findings are constrained in their applicability due to the modest sample size. We gathered 227 questions from a total of 28 subdomains or specialties that included images, which might not comprehensively represent all medical disciplines. The small number of questions in each subdomain may not be sufficient to conclude that GPT-4V’s performance is inconsistent between medical subdomains. Second, the exams used to test GPT-4V are written in English. Future work could explore other languages. Third, the models used for evaluation were from September 2023, and frontier models may have evolved since then, potentially impacting the results. Fourth, we cannot guarantee that OpenAI strictly adhered to licensing terms when determining which content was included or excluded from their training sets; therefore, even though AMBOSS is not publicly available and its licensing terms restrict the automatic website scraping of its proprietary content, GPT may have already seen the data during training, potentially impacting the results. Finally, while GPT-4V has demonstrated proficiency in medical license examination, its CDSS ability remains untested. Future work could explore continued training GPT-4V in the medical domain for better CDSS integration. Medical exams provide options, but such options would rarely be provided by physicians during CDSS. Our study highlights the inherent limitations in GPT-4V’s image interpretation abilities, particularly without expert guidance. We showed that GPT-4V can reduce errors with expert hints, but in more realistic clinical environments, it required continuous highly specialized guidance to make partially correct diagnoses and subsequent examination recommendations, revealing limitations in its autonomous decision-making capabilities. Therefore, more cases with clinician questions should be explored to confirm our findings before clinical integration. 
Extrapolating the efficacy of GPT-4V to broader clinical applications requires appropriate benchmarks and further research.</p>
        <p>Regarding ethical considerations, deploying AI systems for medical advice poses significant ethical implications, especially in medical education and clinical decision-making. Incorrect AI-generated explanations risk disseminating misinformation that could misguide medical professionals, impacting patient safety and treatment outcomes. This is particularly concerning when AI is used in training settings, as it could shape the decision-making abilities of future healthcare providers in potentially harmful ways. Integrating AI into clinical workflows also raises broader societal concerns. While AI has the potential to enhance healthcare efficiency, it could alter patient care dynamics and physician roles. Overreliance on AI may reduce direct physician-patient communication, eroding trust and undermining the relationship-building essential for effective care. Physicians might also become too dependent on AI, potentially compromising their clinical judgment and their ability to critically assess AI-generated insights. Thus, integrating AI in a manner that complements human expertise (supporting rather than replacing health care providers) is vital. Moreover, current benchmarks, including the one in our study, do not fully assess an AI’s capabilities for real-world clinical decision-making. Although some LLMs perform well on benchmarks, they lack the comprehensive clinical skills and nuanced understanding required to navigate complex medical scenarios effectively. Viewing these AI models as tools that assist rather than replace clinicians is crucial to ensuring their safe and beneficial use in health care. A responsible approach is needed when deploying AI for medical advice, one that ensures ethical standards are maintained. Issues such as privacy, bias, and the broader implications of AI in society must guide the development and implementation of these systems. 
By enhancing data diversity, ensuring privacy, and fostering a transparent understanding of AI’s role, we can work toward ethical advancements in health care that enhance outcomes without compromising human oversight or patient trust. Future work should focus on developing AI technologies that are fully aligned with health care professionals, maintaining a collaborative and ethically sound approach to their integration.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>In this study, GPT-4V demonstrated remarkable overall accuracy on the medical licensing examination and provided high-quality explanations when correct. The evaluation of questions with images (a relatively novel feature for GPT models) allowed us to expose hidden flaws in GPT-4V’s image interpretation abilities, offering a unique insight into its strengths and weaknesses. Its performance on image-related questions ranged from 60% to 88%, while physician misdiagnosis rates can be as high as 40% [<xref ref-type="bibr" rid="ref63">63</xref>,<xref ref-type="bibr" rid="ref64">64</xref>]. GPT-4V substantially outperformed medical students on difficult questions, but we observed severe issues in its explanations and reasoning, including hallucinations, errors, and misinterpretations. These findings reveal significant challenges in GPT-4V’s ability to independently interpret and reason through complex image-based questions, which is crucial for clinical applications. Despite its strong performance on multiple-choice questions, GPT-4V may still encounter comprehension or explanation errors. When assisted by human experts, GPT-4V reduced some errors with image-related hints. However, in realistic curbside consult settings, continuous and highly specialized prompting was still required, making it time-consuming and limiting its utility as a clinical decision support system in real-world clinical practice. <xref ref-type="table" rid="table4">Table 4</xref> lists the summary of key findings.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Summary of key findings.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="290"/>
            <col width="710"/>
            <thead>
              <tr valign="top">
                <td>Metric</td>
                <td>Findings</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Accuracy of image-based questions</td>
                <td>GPT-4V achieved 84.2% in Step 1, 85.7% in Step 2 CK<sup>a</sup>, 88.9% in Step 3, and 73.1% in DRQCE<sup>b</sup>, outperforming GPT-3.5 Turbo (42.1%, 50%, 50%, 19.2%) and GPT-4 (63.2%, 64.3%, 66.7%, 26.9%).</td>
              </tr>
              <tr valign="top">
                <td>Explanation quality</td>
                <td>When GPT-4V provided correct answers, its explanations were almost on par with those given by domain experts. However, for incorrect responses, the explanation quality was often lacking: 18.2% included inaccurate information, 45.5% involved inferencing mistakes, and 76.3% reflected misinterpretations of images.</td>
              </tr>
              <tr valign="top">
                <td>The impact of human expert hints</td>
                <td>There is a decreasing trend in GPT-4V’s performance in the AMBOSS dataset as the difficulty of questions increased without the hint. However, with the hint, the performance of GPT-4V plateaued.</td>
              </tr>
              <tr valign="top">
                <td>Performance of GPT-4V on most difficult questions</td>
                <td>GPT-4V with hints outperformed medical students by 60%, and GPT-4V without hints outperformed medical students by 26.7%.</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>CK: clinical knowledge.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>DRQCE: Diagnostic Radiology Qualifying Core Exam.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Overall, our findings emphasize the need for a more comprehensive evaluation of GPT-4V’s multimodal capabilities, especially in clinical image interpretation, before considering its integration into clinical decision support systems. Future randomized clinical trials will help further verify the actual utility of GPT-4V and promote more extensive and profound integration of AI in the medical domain.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Supplementary figures and tables.</p>
        <media xlink:href="jmir_v27i1e65146_app1.docx" xlink:title="DOCX File , 1884 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CDSS</term>
          <def>
            <p>clinical decision support system</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CK</term>
          <def>
            <p>clinical knowledge</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">DRQCE</term>
          <def>
            <p>Diagnostic Radiology Qualifying Core Exam</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">LLMs</term>
          <def>
            <p>large language models</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">USMLE</term>
          <def>
            <p>United States Medical Licensing Examination</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was funded by the grant R01MH125027 from the National Institute of Mental Health of the National Institutes of Health. The funding source had no role in the design and conduct of the study; collection, management, analysis, and interpretation of the data; preparation, review, or approval of the manuscript; and decision to submit the manuscript for publication. The contents of this paper do not represent the views of the National Institutes of Health.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets generated during and/or analyzed during this study are available in the USMLE [<xref ref-type="bibr" rid="ref29">29</xref>], AMBOSS [<xref ref-type="bibr" rid="ref30">30</xref>], and DRQCE [<xref ref-type="bibr" rid="ref32">32</xref>] repositories.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>HY initialized the conceptualization of the project. Yang Z and Yao Z designed the study. Yang Z, Yao Z, MT, and PV implemented the methods. Yang Z, Yao Z, WJ, FO, BW, and DB performed the data analysis. Yang Z and Yao Z interpreted the results with substantial input from DM, DB, and HY. All authors contributed to manuscript preparation.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shortliffe</surname>
              <given-names>EH</given-names>
            </name>
            <name name-style="western">
              <surname>Cimino</surname>
              <given-names>JJ</given-names>
            </name>
          </person-group>
          <source>Biomedical Informatics: Computer Applications in Health Care and Biomedicine</source>
          <year>2014</year>
          <month>12</month>
          <day>02</day>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sutton</surname>
              <given-names>RT</given-names>
            </name>
            <name name-style="western">
              <surname>Pincock</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Baumgart</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Sadowski</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Fedorak</surname>
              <given-names>RN</given-names>
            </name>
            <name name-style="western">
              <surname>Kroeker</surname>
              <given-names>KI</given-names>
            </name>
          </person-group>
          <article-title>An overview of clinical decision support systems: benefits, risks, and strategies for success</article-title>
          <source>NPJ Digit Med</source>
          <year>2020</year>
          <volume>3</volume>
          <fpage>17</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-020-0221-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-020-0221-y</pub-id>
          <pub-id pub-id-type="medline">32047862</pub-id>
          <pub-id pub-id-type="pii">221</pub-id>
          <pub-id pub-id-type="pmcid">PMC7005290</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rajpurkar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lungren</surname>
              <given-names>MP</given-names>
            </name>
          </person-group>
          <article-title>The current and future state of AI interpretation of medical images</article-title>
          <source>N Engl J Med</source>
          <year>2023</year>
          <volume>388</volume>
          <issue>21</issue>
          <fpage>1981</fpage>
          <lpage>1990</lpage>
          <pub-id pub-id-type="doi">10.1056/NEJMra2301725</pub-id>
          <pub-id pub-id-type="medline">37224199</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Aggarwal</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sounderajah</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSW</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ashrafian</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Darzi</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic accuracy of deep learning in medical imaging: a systematic review and meta-analysis</article-title>
          <source>NPJ Digit Med</source>
          <year>2021</year>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>65</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-021-00438-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-021-00438-z</pub-id>
          <pub-id pub-id-type="medline">33828217</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-021-00438-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC8027892</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>ZQ</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>COVID-Net: a tailored deep convolutional neural network design for detection of COVID-19 cases from chest X-ray images</article-title>
          <source>Sci Rep</source>
          <year>2020</year>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>19549</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-020-76550-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-020-76550-z</pub-id>
          <pub-id pub-id-type="medline">33177550</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-020-76550-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC7658227</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Long</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>An</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>An artificial intelligence platform for the multihospital collaborative management of congenital cataracts</article-title>
          <source>Nat Biomed Eng</source>
          <year>2017</year>
          <volume>1</volume>
          <issue>2</issue>
          <fpage>0024</fpage>
          <pub-id pub-id-type="doi">10.1038/s41551-016-0024</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rayan</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Reddy</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Kan</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Annapragada</surname>
              <given-names>AV</given-names>
            </name>
          </person-group>
          <article-title>Binomial classification of pediatric elbow fractures using a deep learning multiview approach emulating radiologist decision making</article-title>
          <source>Radiol Artif Intell</source>
          <year>2019</year>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>e180015</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33937781"/>
          </comment>
          <pub-id pub-id-type="doi">10.1148/ryai.2019180015</pub-id>
          <pub-id pub-id-type="medline">33937781</pub-id>
          <pub-id pub-id-type="pmcid">PMC8017418</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bussone</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Stumpf</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>O'Sullivan</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>The role of explanations on trust and reliance in clinical decision support systems</article-title>
          <year>2015</year>
          <conf-name>International Conference on Healthcare Informatics. Published online</conf-name>
          <conf-date>2015 October 23</conf-date>
          <conf-loc>USA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/ichi.2015.26</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Panigutti</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Beretta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Giannotti</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Pedreschi</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Understanding the impact of explanations on advice-taking: a user study for AI-based clinical Decision Support Systems</article-title>
          <year>2022</year>
          <conf-name>CHI Conference on Human Factors in Computing Systems. Published online 2022</conf-name>
          <conf-date>2022 April 29</conf-date>
          <conf-loc>USA</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3491102.3502104</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gaube</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Suresh</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Raue</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lermer</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Koch</surname>
              <given-names>TK</given-names>
            </name>
            <name name-style="western">
              <surname>Hudecek</surname>
              <given-names>MFC</given-names>
            </name>
            <name name-style="western">
              <surname>Ackery</surname>
              <given-names>AD</given-names>
            </name>
            <name name-style="western">
              <surname>Grover</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Coughlin</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Frey</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kitamura</surname>
              <given-names>FC</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Colak</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Non-task expert physicians benefit from correct explainable AI advice when reviewing X-rays</article-title>
          <source>Sci Rep</source>
          <year>2023</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>1383</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-023-28633-w"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-023-28633-w</pub-id>
          <pub-id pub-id-type="medline">36697450</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-023-28633-w</pub-id>
          <pub-id pub-id-type="pmcid">PMC9876883</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mohammed</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Zelek</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lakshminarayanan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Interpretation of deep learning using attributions: application to ophthalmic diagnosis</article-title>
          <source>In: Optical Engineering + Applications</source>
          <year>2020</year>
          <volume>11511</volume>
          <fpage>11</fpage>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eitel</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Ritter</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Inuzuki</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Reyes</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Syeda-Mahmood</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Testing the robustness of attribution methods for convolutional neural networks in MRI-based Alzheimer's disease classification</article-title>
          <source>Interpretability of Machine Intelligence in Medical Image Computing and Multimodal Learning for Clinical Decision Support. Springer International Publishing</source>
          <year>2019</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>Springer International Publishing</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Papanastasopoulos</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Samala</surname>
              <given-names>RK</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>HP</given-names>
            </name>
            <name name-style="western">
              <surname>Hadjiiski</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Paramagul</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Helvie</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Neal</surname>
              <given-names>CH</given-names>
            </name>
          </person-group>
          <source>Explainable AI for medical imaging: deep-learning CNN ensemble for classification of estrogen receptor status from breast MRI</source>
          <access-date>2020-02-16</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.spiedigitallibrary.org/conference-proceedings-of-spie/11314/2549298/Explainable-AI-for-medical-imaging--deep-learning-CNN-ensemble/10.1117/12.2549298.short">https://www.spiedigitallibrary.org/conference-proceedings-of-spie/11314/2549298/Explainable-AI-for-medical-imaging--deep-learning-CNN-ensemble/10.1117/12.2549298.short</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shamout</surname>
              <given-names>FE</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Kaku</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Makino</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Jastrzębski</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Witowski</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dogra</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Razavian</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Kudlowitz</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Azour</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Moore</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Lui</surname>
              <given-names>YW</given-names>
            </name>
            <name name-style="western">
              <surname>Aphinyanaphongs</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fernandez-Granda</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Geras</surname>
              <given-names>KJ</given-names>
            </name>
          </person-group>
          <article-title>An artificial intelligence system for predicting the deterioration of COVID-19 patients in the emergency department</article-title>
          <source>NPJ Digit Med</source>
          <year>2021</year>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>80</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-021-00453-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-021-00453-0</pub-id>
          <pub-id pub-id-type="medline">33980980</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-021-00453-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC8115328</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Heacock</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elias</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hentel</surname>
              <given-names>KD</given-names>
            </name>
            <name name-style="western">
              <surname>Reig</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Shih</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Moy</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and other large language models are double-edged swords</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <volume>307</volume>
          <issue>2</issue>
          <fpage>e230163</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230163</pub-id>
          <pub-id pub-id-type="medline">36700838</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>OpenAI</collab>
            <name name-style="western">
              <surname>Achiam</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Adler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ahmad</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Akkaya</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Aleman</surname>
              <given-names>FL</given-names>
            </name>
            <name name-style="western">
              <surname>Almeida</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Altenschmidt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Altman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Anadkat</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Avila</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Babuschkin</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Balaji</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Balcom</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Baltescu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Bavarian</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Belgum</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bello</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Berdine</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bernadett-Shapiro</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Bogdonoff</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Boiko</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Brakman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Brockman</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 Technical Report</article-title>
          <source>ArXiv. 2023;abs/2303.08774</source>
          <year>2023</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2303.08774</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goodman</surname>
              <given-names>RS</given-names>
            </name>
            <name name-style="western">
              <surname>Patrinely</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Stone</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Zimmerman</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Donald</surname>
              <given-names>RR</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Berkowitz</surname>
              <given-names>ST</given-names>
            </name>
            <name name-style="western">
              <surname>Finn</surname>
              <given-names>AP</given-names>
            </name>
            <name name-style="western">
              <surname>Jahangir</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Scoville</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Reese</surname>
              <given-names>TS</given-names>
            </name>
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Bastarache</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>van der Heijden</surname>
              <given-names>YF</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Carter</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Alexander</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Choe</surname>
              <given-names>JH</given-names>
            </name>
            <name name-style="western">
              <surname>Chastain</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Zic</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Horst</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Turker</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Osmundson</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Idrees</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kiernan</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Padmanabhan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bailey</surname>
              <given-names>CE</given-names>
            </name>
            <name name-style="western">
              <surname>Schlegel</surname>
              <given-names>CE</given-names>
            </name>
            <name name-style="western">
              <surname>Chambless</surname>
              <given-names>LB</given-names>
            </name>
            <name name-style="western">
              <surname>Gibson</surname>
              <given-names>MK</given-names>
            </name>
            <name name-style="western">
              <surname>Osterman</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Wheless</surname>
              <given-names>LE</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>DB</given-names>
            </name>
          </person-group>
          <article-title>Accuracy and reliability of chatbot responses to physician questions</article-title>
          <source>JAMA Netw Open</source>
          <year>2023</year>
          <volume>6</volume>
          <issue>10</issue>
          <fpage>e2336483</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37782499"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.36483</pub-id>
          <pub-id pub-id-type="medline">37782499</pub-id>
          <pub-id pub-id-type="pii">2809975</pub-id>
          <pub-id pub-id-type="pmcid">PMC10546234</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Decker</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Trang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ramirez</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Colley</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pierce</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Coleman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bongiovanni</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Melton</surname>
              <given-names>GB</given-names>
            </name>
            <name name-style="western">
              <surname>Wick</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Large language model-based chatbot vs surgeon-generated informed consent documentation for common procedures</article-title>
          <source>JAMA Netw Open</source>
          <year>2023</year>
          <volume>6</volume>
          <issue>10</issue>
          <fpage>e2336997</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37812419"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.36997</pub-id>
          <pub-id pub-id-type="medline">37812419</pub-id>
          <pub-id pub-id-type="pii">2810364</pub-id>
          <pub-id pub-id-type="pmcid">PMC10562939</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ayers</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Poliak</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Leas</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Kelley</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Faix</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Goodman</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Longhurst</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Hogarth</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <article-title>Comparing physician and artificial intelligence chatbot responses to patient questions posted to a public social media forum</article-title>
          <source>JAMA Intern Med</source>
          <year>2023</year>
          <volume>183</volume>
          <issue>6</issue>
          <fpage>589</fpage>
          <lpage>596</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37115527"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id>
          <pub-id pub-id-type="medline">37115527</pub-id>
          <pub-id pub-id-type="pii">2804309</pub-id>
          <pub-id pub-id-type="pmcid">PMC10148230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thirunavukarasu</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hassan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mahmood</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sanghera</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Barzangi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>El Mukashfi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Trialling a large language model (ChatGPT) in general practice with the applied knowledge test: observational study demonstrating opportunities and limitations in primary care</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e46599</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e46599/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/46599</pub-id>
          <pub-id pub-id-type="medline">37083633</pub-id>
          <pub-id pub-id-type="pii">v9i1e46599</pub-id>
          <pub-id pub-id-type="pmcid">PMC10163403</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cooper</surname>
              <given-names>AZ</given-names>
            </name>
            <name name-style="western">
              <surname>Rodman</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>AI and medical education - A 21st-century Pandora's box</article-title>
          <source>N Engl J Med</source>
          <year>2023</year>
          <volume>389</volume>
          <issue>5</issue>
          <fpage>385</fpage>
          <lpage>387</lpage>
          <pub-id pub-id-type="doi">10.1056/NEJMp2304993</pub-id>
          <pub-id pub-id-type="medline">37522417</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khader</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Müller-Franzes</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tayebi Arasteh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Haarburger</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Stegmaier</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bressem</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kuhl</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Nebelung</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kather</surname>
              <given-names>JN</given-names>
            </name>
            <name name-style="western">
              <surname>Truhn</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Multimodal deep learning for integrating chest radiographs and clinical parameters: a case for transformers</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <volume>309</volume>
          <issue>1</issue>
          <fpage>e230806</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230806</pub-id>
          <pub-id pub-id-type="medline">37787671</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Usuyama</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Bagga</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tinn</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Preston</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Valluri</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tupini</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mazzola</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shukla</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liden</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lungren</surname>
              <given-names>MP</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Poon</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Large-scale domain-specific pretraining for biomedical vision-language processing</article-title>
          <source>ArXiv. 2023;abs/2303.00915</source>
          <year>2023</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2303.00915</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Driess</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Schaekermann</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Amin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Carroll</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lau</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tanno</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ktena</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Palepu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mustafa</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Chowdhery</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kornblith</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Fleet</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mansfield</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Prakash</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Virmani</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Semturs</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mahdavi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Green</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dominowska</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Arcas</surname>
              <given-names>BAY</given-names>
            </name>
            <name name-style="western">
              <surname>Barral</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Webster</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Matias</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Singhal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Florence</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Towards generalist biomedical AI</article-title>
          <source>NEJM AI</source>
          <year>2024</year>
          <pub-id pub-id-type="doi">10.1056/aioa2300138</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Towards generic anomaly detection and understanding: large-scale visual-linguistic model (GPT-4V) takes the lead</article-title>
          <source>ArXiv. 2023;abs/2311.02782</source>
          <year>2023</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2311.02782</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>The dawn of LMMs: preliminary explorations with GPT-4V(ision)</article-title>
          <source>ArXiv. 2023;abs/2309.17421</source>
          <year>2023</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2309.17421</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Cheung</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Summers</surname>
              <given-names>RM</given-names>
            </name>
            <name name-style="western">
              <surname>Rousseau</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Ni</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Landsman</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Baxter</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Al'Aref</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Brejt</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Chiang</surname>
              <given-names>MF</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Hidden flaws behind expert-level accuracy of multimodal GPT-4 vision in medicine</article-title>
          <source>NPJ Digit Med</source>
          <year>2024</year>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>190</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-024-01185-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-024-01185-7</pub-id>
          <pub-id pub-id-type="medline">39043988</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-024-01185-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC11266508</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>USMLE</collab>
          </person-group>
          <source>Sample Test Questions Step 1</source>
          <year>2023</year>
          <access-date>2024-12-17</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.usmle.org/sites/default/files/2021-10/Step_1_Sample_Items.pdf">https://www.usmle.org/sites/default/files/2021-10/Step_1_Sample_Items.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>AMBOSS</collab>
          </person-group>
          <source>AMBOSS Question difficulty 10/15/2023</source>
          <access-date>2021-06-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://support.amboss.com/hc/en-us/articles/360035679652-Question-difficulty">https://support.amboss.com/hc/en-us/articles/360035679652-Question-difficulty</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>TB</given-names>
            </name>
            <name name-style="western">
              <surname>Mann</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ryder</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Subbiah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kaplan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dhariwal</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Neelakantan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shyam</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sastry</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Askell</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Herbert-Voss</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Krueger</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Henighan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Child</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ramesh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ziegler</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Winter</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hesse</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sigler</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Litwin</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Language models are few-shot learners</article-title>
          <source>arXiv:2005.14165</source>
          <year>2020</year>
          <pub-id pub-id-type="doi">10.5860/choice.189890</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>American Board of Radiology</collab>
          </person-group>
          <source>Qualifying (Core) Exam</source>
          <access-date>2024-05-20</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.theabr.org/diagnostic-radiology/initial-certification/core-exam">https://www.theabr.org/diagnostic-radiology/initial-certification/core-exam</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pallais</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Fenves</surname>
              <given-names>AZ</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Glomski</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Case 18-2018: A 45-year-old woman with hypertension, fatigue, and altered mental status</article-title>
          <source>N Engl J Med</source>
          <year>2018</year>
          <volume>378</volume>
          <issue>24</issue>
          <fpage>2322</fpage>
          <lpage>2333</lpage>
          <pub-id pub-id-type="doi">10.1056/nejmcpc1802825</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bubeck</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Petro</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title>
          <source>N Engl J Med</source>
          <year>2023</year>
          <volume>388</volume>
          <issue>13</issue>
          <fpage>1233</fpage>
          <lpage>1239</lpage>
          <pub-id pub-id-type="doi">10.1056/nejmsr2214184</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>OPENAI</collab>
          </person-group>
          <source>GPT-4V(ision) System Card</source>
          <access-date>2023-09-25</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://api.semanticscholar.org/CorpusID:263218031">https://api.semanticscholar.org/CorpusID:263218031</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jaakkola</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Rethinking cooperative rationalization: introspective extraction and complement control</article-title>
          <source>arXiv:1910.13294</source>
          <year>2019</year>
          <pub-id pub-id-type="doi">10.18653/v1/d19-1420</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zaidan</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Eisner</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Piatko</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Using “Annotator Rationales” to improve machine learning for text categorization</article-title>
          <year>2007</year>
          <conf-name>The Conference of the North American Chapter of the Association for Computational Linguistics; Proceedings of the Main Conference</conf-name>
          <conf-date>2007 April 22-27</conf-date>
          <conf-loc>Rochester, New York</conf-loc>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/N07-1033"/>
          </comment>
          <pub-id pub-id-type="doi">10.3115/1614164.1614179</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Frieske</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ishii</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bang</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Madotto</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Survey of hallucination in natural language generation</article-title>
          <source>ACM Comput. Surv</source>
          <year>2023</year>
          <volume>55</volume>
          <issue>12</issue>
          <fpage>1</fpage>
          <lpage>38</lpage>
          <pub-id pub-id-type="doi">10.1145/3571730</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tinn</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lucas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Usuyama</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Naumann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Poon</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Domain-Specific language model pretraining for biomedical natural language processing</article-title>
          <source>ACM Trans. Comput. Healthcare</source>
          <year>2021</year>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>23</lpage>
          <pub-id pub-id-type="doi">10.1145/3458754</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yao</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>JMLR: Joint Medical LLM and Retrieval Training for Enhancing Reasoning and Professional Question Answering Capability</article-title>
          <source>ArXiv. 2024;abs/2402.17887</source>
          <year>2024</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2402.17887</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singhal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mahdavi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Scales</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tanwani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cole-Lewis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Pfohl</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Payne</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Seneviratne</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gamble</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Babiker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schärli</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chowdhery</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mansfield</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Demner-Fushman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Agüera Y Arcas</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Webster</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Matias</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gottweis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tomasev</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rajkomar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Barral</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Semturs</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Large language models encode clinical knowledge</article-title>
          <source>Nature</source>
          <year>2023</year>
          <volume>620</volume>
          <issue>7972</issue>
          <fpage>172</fpage>
          <lpage>180</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37438534"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="medline">37438534</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC10396962</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bhayana</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Krishna</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bleakney</surname>
              <given-names>RR</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on a radiology board-style examination: insights into current strengths and limitations</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <volume>307</volume>
          <issue>5</issue>
          <fpage>e230582</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230582</pub-id>
          <pub-id pub-id-type="medline">37191485</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gilson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Safranek</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Socrates</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Chartash</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT perform on the United States medical licensing examination (USMLE)? the implications of large language models for medical education and knowledge assessment</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e45312</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023//e45312/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/45312</pub-id>
          <pub-id pub-id-type="medline">36753318</pub-id>
          <pub-id pub-id-type="pii">v9i1e45312</pub-id>
          <pub-id pub-id-type="pmcid">PMC9947764</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sorin</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Vaid</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Soroush</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Glicksberg</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Charney</surname>
              <given-names>AW</given-names>
            </name>
            <name name-style="western">
              <surname>Nadkarni</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Klang</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Comparing ChatGPT and GPT-4 performance in USMLE soft skill assessments</article-title>
          <source>Sci Rep</source>
          <year>2023</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>16492</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-023-43436-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-023-43436-9</pub-id>
          <pub-id pub-id-type="medline">37779171</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-023-43436-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC10543445</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nakao</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Miki</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nakamura</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Kikuchi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Nomura</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hanaoka</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Yoshikawa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Abe</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Capability of GPT-4V(ision) in the Japanese national medical licensing examination: evaluation study</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>e54393</fpage>
          <pub-id pub-id-type="doi">10.2196/54393</pub-id>
          <pub-id pub-id-type="medline">38470459</pub-id>
          <pub-id pub-id-type="pii">v10i1e54393</pub-id>
          <pub-id pub-id-type="pmcid">PMC10966435</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Takagi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Koda</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Watari</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>The performance of ChatGPT-4V in interpreting images and tables in the Japanese medical licensing exam</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>e54283</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024//e54283/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/54283</pub-id>
          <pub-id pub-id-type="medline">38787024</pub-id>
          <pub-id pub-id-type="pii">v10i1e54283</pub-id>
          <pub-id pub-id-type="pmcid">PMC11148840</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ong</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kennedy</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Kazam</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hentel</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Flanders</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shih</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Evaluating GPT-4V (GPT-4 with Vision) on detection of radiologic findings on chest radiographs</article-title>
          <source>Radiology</source>
          <year>2024</year>
          <volume>311</volume>
          <issue>2</issue>
          <fpage>e233270</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.233270</pub-id>
          <pub-id pub-id-type="medline">38713028</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sorin</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Barash</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Konen</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Glicksberg</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Nadkarni</surname>
              <given-names>GN</given-names>
            </name>
            <name name-style="western">
              <surname>Klang</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Assessing GPT-4 multimodal performance in radiological image analysis</article-title>
          <source>Eur Radiol</source>
          <year>2024</year>
          <pub-id pub-id-type="doi">10.1007/s00330-024-11035-5</pub-id>
          <pub-id pub-id-type="medline">39214893</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00330-024-11035-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>The Federation of State Medical Boards (FSMB) and the National Board of Medical Examiners® (NBME®)</collab>
          </person-group>
          <source>Step 3 - United States Medical Licensing Examination</source>
          <access-date>2024-12-17</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.usmle.org/step-exams/step-3">https://www.usmle.org/step-exams/step-3</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elkassem</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>AD</given-names>
            </name>
          </person-group>
          <article-title>Potential use cases for ChatGPT in radiology reporting</article-title>
          <source>AJR Am J Roentgenol</source>
          <year>2023</year>
          <volume>221</volume>
          <issue>3</issue>
          <fpage>373</fpage>
          <lpage>376</lpage>
          <pub-id pub-id-type="doi">10.2214/AJR.23.29198</pub-id>
          <pub-id pub-id-type="medline">37095665</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hirosawa</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Harada</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yokose</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sakamoto</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kawamura</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Shimizu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic accuracy of differential-diagnosis lists generated by generative pretrained transformer 3 chatbot for clinical vignettes with common chief complaints: a pilot study</article-title>
          <source>Int J Environ Res Public Health</source>
          <year>2023</year>
          <volume>20</volume>
          <issue>4</issue>
          <fpage>3378</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=ijerph20043378"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/ijerph20043378</pub-id>
          <pub-id pub-id-type="medline">36834073</pub-id>
          <pub-id pub-id-type="pii">ijerph20043378</pub-id>
          <pub-id pub-id-type="pmcid">PMC9967747</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shea</surname>
              <given-names>YF</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>CMY</given-names>
            </name>
            <name name-style="western">
              <surname>Ip</surname>
              <given-names>WCT</given-names>
            </name>
            <name name-style="western">
              <surname>Luk</surname>
              <given-names>DWA</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>SSW</given-names>
            </name>
          </person-group>
          <article-title>Use of GPT-4 to analyze medical records of patients with extensive investigations and delayed diagnosis</article-title>
          <source>JAMA Netw Open</source>
          <year>2023</year>
          <volume>6</volume>
          <issue>8</issue>
          <fpage>e2325000</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37578798"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.25000</pub-id>
          <pub-id pub-id-type="medline">37578798</pub-id>
          <pub-id pub-id-type="pii">2808251</pub-id>
          <pub-id pub-id-type="pmcid">PMC10425828</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lalor</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Learning latent parameters without human response patterns: item response theory with artificial crowds</article-title>
          <source>arXiv:1908.11421</source>
          <year>2019</year>
          <pub-id pub-id-type="doi">10.18653/v1/d19-1434</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liberati</surname>
              <given-names>EG</given-names>
            </name>
            <name name-style="western">
              <surname>Ruggiero</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Galuppo</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gorli</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>González-Lorenzo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Maraldi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ruggieri</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Polo Friz</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Scaratti</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Kwag</surname>
              <given-names>KH</given-names>
            </name>
            <name name-style="western">
              <surname>Vespignani</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Moja</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>What hinders the uptake of computerized decision support systems in hospitals? A qualitative study and framework for implementation</article-title>
          <source>Implement Sci</source>
          <year>2017</year>
          <volume>12</volume>
          <issue>1</issue>
          <fpage>113</fpage>
          <pub-id pub-id-type="doi">10.1186/s13012-017-0644-2</pub-id>
          <pub-id pub-id-type="medline">28915822</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13012-017-0644-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC5602839</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Strohm</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hehakaya</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ranschaert</surname>
              <given-names>ER</given-names>
            </name>
            <name name-style="western">
              <surname>Boon</surname>
              <given-names>WPC</given-names>
            </name>
            <name name-style="western">
              <surname>Moors</surname>
              <given-names>EHM</given-names>
            </name>
          </person-group>
          <article-title>Implementation of artificial intelligence (AI) applications in radiology: hindering and facilitating factors</article-title>
          <source>Eur Radiol</source>
          <year>2020</year>
          <volume>30</volume>
          <issue>10</issue>
          <fpage>5525</fpage>
          <lpage>5532</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32458173"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s00330-020-06946-y</pub-id>
          <pub-id pub-id-type="medline">32458173</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00330-020-06946-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC7476917</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Van Cauwenberge</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Van Biesen</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Decruyenaere</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Leune</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sterckx</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>"Many roads lead to Rome and the artificial Intelligence only shows me one road": an interview study on physician attitudes regarding the implementation of computerised clinical decision support systems</article-title>
          <source>BMC Med Ethics</source>
          <year>2022</year>
          <volume>23</volume>
          <issue>1</issue>
          <fpage>50</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedethics.biomedcentral.com/articles/10.1186/s12910-022-00787-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12910-022-00787-8</pub-id>
          <pub-id pub-id-type="medline">35524301</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12910-022-00787-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC9077861</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Thornton</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wyatt</surname>
              <given-names>JC</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence and clinical decision support: clinicians' perspectives on trust, trustworthiness, and liability</article-title>
          <source>Med Law Rev</source>
          <year>2023</year>
          <volume>31</volume>
          <issue>4</issue>
          <fpage>501</fpage>
          <lpage>520</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37218368"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/medlaw/fwad013</pub-id>
          <pub-id pub-id-type="medline">37218368</pub-id>
          <pub-id pub-id-type="pii">7176027</pub-id>
          <pub-id pub-id-type="pmcid">PMC10681355</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>UB</given-names>
            </name>
            <name name-style="western">
              <surname>Ward</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Kadoch</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Cham</surname>
              <given-names>MD</given-names>
            </name>
          </person-group>
          <article-title>Radiographic features of pulmonary embolism: Hampton's hump</article-title>
          <source>Postgrad Med J</source>
          <year>2014</year>
          <volume>90</volume>
          <issue>1065</issue>
          <fpage>420</fpage>
          <lpage>421</lpage>
          <pub-id pub-id-type="doi">10.1136/postgradmedj-2013-132097</pub-id>
          <pub-id pub-id-type="medline">24894313</pub-id>
          <pub-id pub-id-type="pii">postgradmedj-2013-132097</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Cano</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Romanou</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bonnet</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Matoba</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Salvi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Pagliardini</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Köpf</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mohtashami</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sallinen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sakhaeirad</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Swamy</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Krawczuk</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Bayazit</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Marmet</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Montariol</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hartley</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Jaggi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bosselut</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>MEDITRON-70B: scaling medical pretraining for large language models</article-title>
          <source>arXiv:2311.16079</source>
          <year>2023</year>
          <pub-id pub-id-type="doi">10.48550/arXiv.2311.16079</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Schramm</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Berberich</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Rosenkranz</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Schmitzer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Serguen</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Klenk</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lenhart</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Zimmer</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wiestler</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Hedderich</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <source>Human-AI collaboration in large language model-assisted brain MRI differential diagnosis: a usability study</source>
          <year>2024</year>
          <access-date>2024-02-06</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.medrxiv.org/content/10.1101/2024.02.05.24302099v1">https://www.medrxiv.org/content/10.1101/2024.02.05.24302099v1</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>YY</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT in radiology: evaluating proficiencies, addressing shortcomings, and proposing integrative approaches for the future</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <volume>308</volume>
          <issue>1</issue>
          <fpage>e231335</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.231335</pub-id>
          <pub-id pub-id-type="medline">37432082</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sallam</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title>
          <source>Healthcare (Basel)</source>
          <year>2023</year>
          <volume>11</volume>
          <issue>6</issue>
          <fpage>887</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=healthcare11060887"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id>
          <pub-id pub-id-type="medline">36981544</pub-id>
          <pub-id pub-id-type="pii">healthcare11060887</pub-id>
          <pub-id pub-id-type="pmcid">PMC10048148</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gandhi</surname>
              <given-names>TK</given-names>
            </name>
            <name name-style="western">
              <surname>Kachalia</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>EJ</given-names>
            </name>
            <name name-style="western">
              <surname>Puopolo</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Yoon</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Brennan</surname>
              <given-names>TA</given-names>
            </name>
            <name name-style="western">
              <surname>Studdert</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <article-title>Missed and delayed diagnoses in the ambulatory setting: a study of closed malpractice claims</article-title>
          <source>Ann Intern Med</source>
          <year>2006</year>
          <volume>145</volume>
          <issue>7</issue>
          <fpage>488</fpage>
          <lpage>496</lpage>
          <pub-id pub-id-type="doi">10.7326/0003-4819-145-7-200610030-00006</pub-id>
          <pub-id pub-id-type="medline">17015866</pub-id>
          <pub-id pub-id-type="pii">145/7/488</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref64">
        <label>64</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Berlin</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Radiologic errors, past, present and future</article-title>
          <source>Diagnosis (Berl)</source>
          <year>2014</year>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>79</fpage>
          <lpage>84</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.degruyter.com/document/doi/10.1515/dx-2013-0012"/>
          </comment>
          <pub-id pub-id-type="doi">10.1515/dx-2013-0012</pub-id>
          <pub-id pub-id-type="medline">29539959</pub-id>
          <pub-id pub-id-type="pii">/j/dx.2014.1.issue-1/dx-2013-0012/dx-2013-0012.xml</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
