<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="letter" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v26i1e53724</article-id>
      <article-id pub-id-type="pmid">38739441</article-id>
      <article-id pub-id-type="doi">10.2196/53724</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Research Letter</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Research Letter</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Evaluating the Diagnostic Performance of Large Language Models on Complex Multimodal Medical Cases</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>de Azevedo Cardoso</surname>
            <given-names>Taiane</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Yang</surname>
            <given-names>Rui</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhu</surname>
            <given-names>Lingxuan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mondal</surname>
            <given-names>Himel</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Chiu</surname>
            <given-names>Wan Hang Keith</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7930-1193</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Ko</surname>
            <given-names>Wei Sum Koel</given-names>
          </name>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5656-1822</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Cho</surname>
            <given-names>William Chi Shing</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4174-4586</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Hui</surname>
            <given-names>Sin Yu Joanne</given-names>
          </name>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0000-9309-2423</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Chan</surname>
            <given-names>Wing Chi Lawrence</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6451-2273</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Kuo</surname>
            <given-names>Michael D</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <address>
            <institution>Ensemble Group</institution>
            <addr-line>10541 E Firewheel Drive</addr-line>
            <addr-line>Scottsdale, AZ, 85259</addr-line>
            <country>United States</country>
            <phone>1 4084512341</phone>
            <email>mikedkuo@gmail.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4747-1611</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Diagnostic and Interventional Radiology</institution>
        <institution>Queen Elizabeth Hospital</institution>
        <addr-line>Hong Kong</addr-line>
        <country>China (Hong Kong)</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Clinical Oncology</institution>
        <institution>Queen Elizabeth Hospital</institution>
        <addr-line>Hong Kong</addr-line>
        <country>China (Hong Kong)</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>School of Biomedical Sciences</institution>
        <institution>Li Ka Shing Faculty of Medicine</institution>
        <institution>The University of Hong Kong</institution>
        <addr-line>Hong Kong</addr-line>
        <country>China (Hong Kong)</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Health Technology and Informatics</institution>
        <institution>The Hong Kong Polytechnic University</institution>
        <addr-line>Hong Kong</addr-line>
        <country>China (Hong Kong)</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Ensemble Group</institution>
        <addr-line>Scottsdale, AZ</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Michael D Kuo <email>mikedkuo@gmail.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>13</day>
        <month>5</month>
        <year>2024</year>
      </pub-date>
      <volume>26</volume>
      <elocation-id>e53724</elocation-id>
      <history>
        <date date-type="received">
          <day>17</day>
          <month>10</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>5</day>
          <month>2</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>22</day>
          <month>2</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>23</day>
          <month>4</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Wan Hang Keith Chiu, Wei Sum Koel Ko, William Chi Shing Cho, Sin Yu Joanne Hui, Wing Chi Lawrence Chan, Michael D Kuo. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 13.05.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2024/1/e53724" xlink:type="simple"/>
      <abstract>
        <p>Large language models showed interpretive reasoning in solving diagnostically challenging medical cases.</p>
      </abstract>
      <kwd-group>
        <kwd>large language model</kwd>
        <kwd>hospital</kwd>
        <kwd>health center</kwd>
        <kwd>Massachusetts</kwd>
        <kwd>statistical analysis</kwd>
        <kwd>chi-square</kwd>
        <kwd>ANOVA</kwd>
        <kwd>clinician</kwd>
        <kwd>physician</kwd>
        <kwd>performance</kwd>
        <kwd>proficiency</kwd>
        <kwd>disease etiology</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Large language models (LLMs) have demonstrated surprising performance in radiological examinations [<xref ref-type="bibr" rid="ref1">1</xref>]. However, their proficiency in real-world medical reasoning, especially when integrating multimodal data, remains uncertain [<xref ref-type="bibr" rid="ref2">2</xref>]. This study evaluates the ability of 3 commonly used LLMs—Google Bard (subsequently rebranded Gemini), Claude 2, and GPT-4—to generate differential diagnoses (ddx) from complex multimodality diagnostic cases.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>Consecutive case records of the Massachusetts General Hospital from July 2020 to June 2023 were selected [<xref ref-type="bibr" rid="ref3">3</xref>]. The cases were diagnostically challenging, but a final diagnosis was provided. Only the case presentation and a simple prompt asking for the top 5 ddx were used as input. Each case was run independently to prevent the model from being influenced by prior cases. To evaluate the stability of the results, all cases were reinputted into each LLM. To enable objective assessment, all diagnoses were mapped to their corresponding <italic>International Classification of Diseases, Tenth Revision</italic> (<italic>ICD-10</italic>) codes, with higher-level codes used in case an exact code could not be assigned (<xref rid="figure1" ref-type="fig">Figure 1</xref>).</p>
        <p>The primary objective was accuracy, measured by whether the final diagnosis was within the LLM-generated ddx at the <italic>ICD-10</italic> category level. The secondary objectives were to measure the similarity between diagnoses within the ddx and the final diagnosis as well as their similarity to each other, measured at the <italic>ICD-10</italic> chapter level. Chi-square and ANOVA tests were used to compare categorical data between the LLMs. Statistical analyses were performed using Prism 10 (GraphPad Software).</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>(A) Standardized prompt used for each case to generate differential diagnoses (ddx). (B) An example of <italic>International Classification of Diseases, Tenth Revision</italic> (<italic>ICD-10</italic>) code hierarchy structure; the first character (an alphabetical letter) denotes the chapter, and when combined with the next 2 digits, it forms the <italic>ICD-10</italic> category code. (C) An example of a large language model (LLM)–generated ddx and the corresponding <italic>ICD-10</italic> codes (case 34); in this case, none of the 3 LLMs included the final diagnosis (high-grade B-cell lymphoma, not otherwise specified; C83.30) in their ddx. For Bard, 3 of the 5 ddx belonged to the same chapter as the final diagnosis (chapter II: C22.0, C85.9, and C79.9). For Claude 2, only 1 of the 5 ddx belonged to the same chapter as the final diagnosis (chapter II: C85.9). For GPT-4, only 1 of the 5 ddx belonged to the same chapter as the final diagnosis (chapter II: C79.9).</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e53724_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Ethics Approval</title>
        <p>Approval from an institutional review board was not required due to the use of publicly available nonidentifiable data.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>The diagnostic accuracy on 104 evaluated cases based on the first set of answers by the LLMs was 27.9% for Bard, 30.8% for Claude 2, and 31.7% for GPT-4. Accuracy significantly improved at the <italic>ICD-10</italic> chapter (body site or system) level, reaching 65.4% for Bard, 66.3% for Claude 2, and 71.2% for GPT-4. The mean number of the same ddx generated in each case in the repeatability testing was 2.3 (SD 1.1) for Bard, 2.4 (SD 1.2) for Claude 2, and 2.4 (SD 1.2) for GPT-4.</p>
      <p>All 3 LLMs showed evidence of interpretive reasoning, as they tended to generate sets of ddx whose member diagnoses were often related to each other. The mean number of ddx per case belonging to the same <italic>ICD-10</italic> chapter as each other was 2.6 (SD 1.1) for Bard, 2.7 (SD 1.1) for Claude 2, and 2.4 (SD 0.9) for GPT-4. Interestingly, these related diagnosis “clusters” were often unrelated to the final diagnosis. The mean number of ddx belonging to the same <italic>ICD-10</italic> chapter as the final diagnosis was 1.2 (SD 1.3) for Bard, 1.4 (SD 1.4) for Claude 2, and 1.4 (SD 1.2) for GPT-4. These two findings were irrespective of whether the LLMs could include the final diagnosis in their ddx. Furthermore, the performance of the LLMs varied by disease etiology, although this difference was not statistically significant (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Performance of individual large language models (LLMs).</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="520"/>
          <col width="110"/>
          <col width="110"/>
          <col width="110"/>
          <col width="0"/>
          <col width="120"/>
          <thead>
            <tr valign="top">
              <td colspan="2">Characteristics</td>
              <td>Bard</td>
              <td>Claude 2</td>
              <td>GPT-4</td>
              <td colspan="2"><italic>P</italic> value</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td colspan="6">
                <bold>Accuracy by <italic>ICD-10</italic><sup>a</sup> hierarchy level, %</bold>
              </td>
              <td>
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Category</td>
              <td>27.9</td>
              <td>30.8</td>
              <td>31.7</td>
              <td colspan="2">&lt;.001<sup>b</sup></td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Chapter</td>
              <td>65.4</td>
              <td>66.3</td>
              <td>71.2</td>
              <td colspan="2">&lt;.001<sup>b</sup></td>
            </tr>
            <tr valign="top">
              <td colspan="6">
                <bold>Accuracy by <italic>ICD-10</italic> etiology (top 5 by frequency), n (%)</bold>
              </td>
              <td>
                <break/>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Certain infectious and parasitic diseases (chapter I: A00-B99)</td>
              <td>20 (35.0)</td>
              <td>45.0</td>
              <td>50.0</td>
              <td colspan="2">.62<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Neoplasms (chapter II: C00-D48)</td>
              <td>19 (52.6)</td>
              <td>63.2</td>
              <td>57.9</td>
              <td colspan="2">.75<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Diseases of the blood and blood-forming organs and certain disorders involving the immune mechanism (chapter III: D50-D89)</td>
              <td>8 (12.5)</td>
              <td>25.0</td>
              <td>12.5</td>
              <td colspan="2">.74<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Endocrine, nutritional, and metabolic diseases (chapter IV: E00-E90)</td>
              <td>9 (33.3)</td>
              <td>33.3</td>
              <td>33.3</td>
              <td colspan="2">&gt;.99<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Diseases of the musculoskeletal system and connective tissue (chapter XIII: M00-M99)</td>
              <td>11 (36.4)</td>
              <td>72.7</td>
              <td>63.6</td>
              <td colspan="2">.20<sup>c</sup></td>
            </tr>
            <tr valign="top">
              <td colspan="2">Number of diagnoses per ddx<sup>d</sup> per case generated by LLMs belonging to the same hierarchical chapter as the final diagnosis based on assigned <italic>ICD-10</italic> codes, mean (SD)</td>
              <td>1.2 (1.3)</td>
              <td>1.4 (1.4)</td>
              <td>1.4 (1.2)</td>
              <td colspan="2">—<sup>e</sup></td>
            </tr>
            <tr valign="top">
              <td colspan="2">Number of diagnoses per ddx per case generated by LLMs belonging to the same hierarchical chapter based on assigned <italic>ICD-10</italic> codes, mean (SD)</td>
              <td>2.6 (1.1)</td>
              <td>2.7 (1.1)</td>
              <td>2.4 (0.9)</td>
              <td colspan="2">—</td>
            </tr>
            <tr valign="top">
              <td colspan="2">Number of the same ddx per case generated by LLMs on repeatability testing, mean (SD)</td>
              <td>2.3 (1.1)</td>
              <td>2.4 (1.2)</td>
              <td>2.4 (1.2)</td>
              <td colspan="2">—</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup><italic>ICD-10</italic>: <italic>International Classification of Diseases, Tenth Revision</italic>.</p>
          </fn>
          <fn id="table1fn2">
            <p><sup>b</sup>Comparison of each LLM’s performance at the <italic>ICD-10</italic> category level versus the chapter level.</p>
          </fn>
          <fn id="table1fn3">
            <p><sup>c</sup>Comparison of each LLM’s performance across different <italic>ICD-10</italic> etiologies. <italic>P</italic> values were not significant.</p>
          </fn>
          <fn id="table1fn4">
            <p><sup>d</sup>ddx: differential diagnoses.</p>
          </fn>
          <fn id="table1fn5">
            <p><sup>e</sup>Not applicable.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>This study rigorously evaluated the diagnostic capacity of multiple LLMs using a simple standardized prompt [<xref ref-type="bibr" rid="ref4">4</xref>]. The 3 LLMs represent state-of-the-art, general LLMs accessible to most clinicians. The relatively low accuracy of all 3 models at the <italic>ICD-10</italic> category level and a mean of &gt;3 out of 5 diagnoses located in a chapter outside the final diagnosis chapter collectively suggest either a knowledge or reasoning gap in current LLMs. Although performance differences were observed between different types of disease etiology (eg, 12.5% for chapter III vs 63.6% for chapter XIII in GPT-4), the small numbers and unequal distribution of etiologies preclude adequate analysis; however, this area warrants further investigation. Conversely, the moderate number of LLM-generated ddx belonging to the same body site or system (chapter) implies these models can integrate and reason across complex clinical findings.</p>
      <p>This study has limitations. First, the reproducibility of the ddx generated by the LLMs was low. The generative nature of these models and their continuous updates may lead to performance drifts and contradictory results. Further research and validation are necessary to generate consistent and explainable results as well as explore the relationships between performance and repeatability. Second, we did not assess whether human–artificial intelligence interaction or prompt engineering would affect diagnostic accuracy. Nevertheless, attempts to “overengineer” general LLMs toward a desired output could cloud real-world applicability, detracting from the ease of use that makes current LLMs attractive to general users [<xref ref-type="bibr" rid="ref5">5</xref>]. Future work includes analyzing the rationales provided by the LLMs in reaching their ddx and asking the LLMs to quantify the likelihood of each ddx. Finally, the diversity of LLM-generated ddx warrants further exploration, as it could potentially hamper patient management [<xref ref-type="bibr" rid="ref6">6</xref>].</p>
      <p>In conclusion, LLMs may have a role in enhancing physician diagnosis of complex, multimodal clinical cases when applied judiciously.</p>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ddx</term>
          <def>
            <p>differential diagnoses</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">
            <italic>ICD-10</italic>
          </term>
          <def>
            <p>
              <italic>International Classification of Diseases, Tenth Revision</italic>
            </p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bhayana</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Krishna</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bleakney</surname>
              <given-names>RR</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on a radiology board-style examination: insights into current strengths and limitations</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>06</month>
          <day>01</day>
          <volume>307</volume>
          <issue>5</issue>
          <fpage>e230582</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230582</pub-id>
          <pub-id pub-id-type="medline">37191485</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jamshidi</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Feizi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sirlin</surname>
              <given-names>CB</given-names>
            </name>
            <name name-style="western">
              <surname>Lavine</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Kuo</surname>
              <given-names>MD</given-names>
            </name>
          </person-group>
          <article-title>Multi-modality, multi-dimensional characterization of pediatric non-alcoholic fatty liver disease</article-title>
          <source>Metabolites</source>
          <year>2023</year>
          <month>08</month>
          <day>08</day>
          <volume>13</volume>
          <issue>8</issue>
          <fpage>929</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=metabo13080929"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/metabo13080929</pub-id>
          <pub-id pub-id-type="medline">37623872</pub-id>
          <pub-id pub-id-type="pii">metabo13080929</pub-id>
          <pub-id pub-id-type="pmcid">PMC10456937</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dougan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Abramson</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Fitzpatrick</surname>
              <given-names>MJ</given-names>
            </name>
          </person-group>
          <article-title>Case 14-2022: a 57-year-old man with chylous ascites</article-title>
          <source>N Engl J Med</source>
          <year>2022</year>
          <month>05</month>
          <day>12</day>
          <volume>386</volume>
          <issue>19</issue>
          <fpage>1834</fpage>
          <lpage>1844</lpage>
          <pub-id pub-id-type="doi">10.1056/nejmcpc2115856</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kanjee</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Crowe</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rodman</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title>
          <source>JAMA</source>
          <year>2023</year>
          <month>07</month>
          <day>03</day>
          <volume>330</volume>
          <issue>1</issue>
          <fpage>78</fpage>
          <lpage>80</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37318797"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id>
          <pub-id pub-id-type="medline">37318797</pub-id>
          <pub-id pub-id-type="pii">2806457</pub-id>
          <pub-id pub-id-type="pmcid">PMC10273128</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fink</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Bischoff</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fink</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Moll</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kroschke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dulz</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Heußel</surname>
              <given-names>Claus Peter</given-names>
            </name>
            <name name-style="western">
              <surname>Kauczor</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Weber</surname>
              <given-names>TF</given-names>
            </name>
          </person-group>
          <article-title>Potential of ChatGPT and GPT-4 for data mining of free-text CT reports on lung cancer</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>09</month>
          <volume>308</volume>
          <issue>3</issue>
          <fpage>e231362</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.231362</pub-id>
          <pub-id pub-id-type="medline">37724963</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kamineni</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lie</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Prasad</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Landman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dreyer</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Succi</surname>
              <given-names>MD</given-names>
            </name>
          </person-group>
          <article-title>Assessing the utility of ChatGPT throughout the entire clinical workflow: development and usability study</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <month>08</month>
          <day>22</day>
          <volume>25</volume>
          <fpage>e48659</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e48659/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48659</pub-id>
          <pub-id pub-id-type="medline">37606976</pub-id>
          <pub-id pub-id-type="pii">v25i1e48659</pub-id>
          <pub-id pub-id-type="pmcid">PMC10481210</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
