<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="letter" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v25i1e50865</article-id>
      <article-id pub-id-type="pmid">38133918</article-id>
      <article-id pub-id-type="doi">10.2196/50865</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Research Letter</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Research Letter</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Evaluation of GPT-4’s Chest X-Ray Impression Generation: A Reader Study on Performance and Perception</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Toomey</surname>
            <given-names>Rachel</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Ziegelmayer</surname>
            <given-names>Sebastian</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Diagnostic and Interventional Radiology</institution>
            <institution>School of Medicine &#38; Klinikum rechts der Isar</institution>
            <institution>Technical University of Munich</institution>
            <addr-line>Ismaninger Straße 22</addr-line>
            <addr-line>Munich, 81675</addr-line>
            <country>Germany</country>
            <phone>49 1759153694</phone>
            <email>ga89rog@mytum.de</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-8724-4718</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Marka</surname>
            <given-names>Alexander W</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2111-8177</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Lenhart</surname>
            <given-names>Nicolas</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0005-4646-7532</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Nehls</surname>
            <given-names>Nadja</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6073-6464</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Reischl</surname>
            <given-names>Stefan</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7341-4296</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Harder</surname>
            <given-names>Felix</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3182-5924</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Sauter</surname>
            <given-names>Andreas</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4394-862X</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Makowski</surname>
            <given-names>Marcus</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7719-8236</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Graf</surname>
            <given-names>Markus</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4668-0326</ext-link>
        </contrib>
        <contrib id="contrib10" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Gawlitza</surname>
            <given-names>Joshua</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9454-816X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Diagnostic and Interventional Radiology</institution>
        <institution>School of Medicine &#38; Klinikum rechts der Isar</institution>
        <institution>Technical University of Munich</institution>
        <addr-line>Munich</addr-line>
        <country>Germany</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Sebastian Ziegelmayer <email>ga89rog@mytum.de</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2023</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>22</day>
        <month>12</month>
        <year>2023</year>
      </pub-date>
      <volume>25</volume>
      <elocation-id>e50865</elocation-id>
      <history>
        <date date-type="received">
          <day>14</day>
          <month>7</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>15</day>
          <month>8</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>16</day>
          <month>8</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>27</day>
          <month>11</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Sebastian Ziegelmayer, Alexander W Marka, Nicolas Lenhart, Nadja Nehls, Stefan Reischl, Felix Harder, Andreas Sauter, Marcus Makowski, Markus Graf, Joshua Gawlitza. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 22.12.2023.</copyright-statement>
      <copyright-year>2023</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2023/1/e50865" xlink:type="simple"/>
      <abstract>
        <p>Exploring the generative capabilities of the multimodal GPT-4, our study uncovered significant differences between radiological assessments and automatic evaluation metrics for chest x-ray impression generation and revealed radiological bias.</p>
      </abstract>
      <kwd-group>
        <kwd>generative model</kwd>
        <kwd>GPT</kwd>
        <kwd>medical imaging</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>imaging</kwd>
        <kwd>radiology</kwd>
        <kwd>radiological</kwd>
        <kwd>radiography</kwd>
        <kwd>diagnostic</kwd>
        <kwd>chest</kwd>
        <kwd>x-ray</kwd>
        <kwd>x-rays</kwd>
        <kwd>generative</kwd>
        <kwd>multimodal</kwd>
        <kwd>impression</kwd>
        <kwd>impressions</kwd>
        <kwd>image</kwd>
        <kwd>images</kwd>
        <kwd>AI</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Generative models trained on large-scale data sets have demonstrated an unprecedented ability to generate humanlike text [<xref ref-type="bibr" rid="ref1">1</xref>] and have performed surprisingly well on untrained tasks (zero-shot learning) [<xref ref-type="bibr" rid="ref2">2</xref>]. In medical imaging, the applications are manifold, and it has been shown that models can not only draw radiological conclusions [<xref ref-type="bibr" rid="ref3">3</xref>] but also structure reports [<xref ref-type="bibr" rid="ref4">4</xref>] and even generate impressions based on the findings given in a report [<xref ref-type="bibr" rid="ref5">5</xref>] or the image itself [<xref ref-type="bibr" rid="ref6">6</xref>]. One of the leading obstacles limiting the development of models for generating clinically applicable reports is the lack of evaluation metrics that capture the core aspects of radiological impressions [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. While there are initial studies on the perception of artificial intelligence (AI)–generated text in the general population [<xref ref-type="bibr" rid="ref9">9</xref>], insights are missing for specialized areas such as medical imaging. Therefore, our study investigated the ability of GPT-4 to generate radiological impressions based on different inputs, focusing on the correlation between radiological assessment of impression quality and common automated evaluation metrics, as well as radiological perception of AI-generated text.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>To generate and evaluate impressions of chest x-rays based on different input modalities (image, text, text and image), a blinded radiological report was written for 25 cases from a publicly available National Institutes of Health data set [<xref ref-type="bibr" rid="ref10">10</xref>]. The GPT-4 model was given an image, the results, or both sequentially to generate an input-dependent impression. In a blind randomized reading, 4 radiologists rated the impressions based on “coherence,” “factual consistency,” “comprehensiveness,” and “medical harmfulness,” which were used to generate a radiological score based on a 5-point Likert scale of each dimension. Additionally, radiologists were asked to classify the origin of the impression (human, AI), providing justification for their decision. The text model evaluation metrics and their correlation with the radiological score were assessed. Lastly, common model metrics for text evaluation were extracted and compared to the radiological assessment. The supplementary methods in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref17">17</xref>] provide further details.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>Due to the publicly available data set used in this study, the requirement to obtain written informed consent from the participants was waived. Participants were anonymized.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>According to the radiological score, the human-written impression was rated highest, although not significantly higher than the text-based impressions (<xref ref-type="table" rid="table1">Table 1</xref>). A detailed analysis is shown in the supplementary results section in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The automated evaluation metrics showed moderate correlations to the radiological score for the image impressions; however, individual scores diverged depending on the input (<xref rid="figure1" ref-type="fig">Figure 1</xref>). Correct detection of an impression’s origin (human/AI) varied by input (text: 61/100, 61%; image: 87/100, 87%; radiologist: 87/100, 87%; text and image: 63/100, 63%). For the text input, a homogeneous distribution was found, similar to radiological impressions classified as AI generated (supplementary figure in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). It was shown that impressions classified as human written were rated significantly higher by the radiologist, with a mean score of 18.11 (SD 1.87) for impressions classified as human written and 13.41 (SD 3.93; <italic>P</italic>≤.001) for impressions classified as AI generated.</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Quantitative and qualitative scores based on the input<sup>a</sup>.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="160"/>
          <col width="160"/>
          <col width="120"/>
          <col width="120"/>
          <col width="200"/>
          <col width="120"/>
          <col width="120"/>
          <thead>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Qualitative</td>
              <td colspan="3">Quantitative</td>
            </tr>
            <tr valign="bottom">
              <td>
                <break/>
              </td>
              <td>Radiologist score</td>
              <td>BLEU<sup>b</sup></td>
              <td>BERT<sup>c</sup></td>
              <td>CheXbert vector similarity</td>
              <td>RadGraph</td>
              <td>RadCliQ</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td>Image</td>
              <td>10.97<sup>d</sup></td>
              <td>0.051<sup>e</sup></td>
              <td>0.298<sup>e</sup></td>
              <td>0.471</td>
              <td>0.038<sup>d</sup></td>
              <td>0.328<sup>d</sup></td>
            </tr>
            <tr valign="top">
              <td>Text</td>
              <td>16.95</td>
              <td>0.125</td>
              <td>0.356</td>
              <td>0.417</td>
              <td>0.168</td>
              <td>0.291</td>
            </tr>
            <tr valign="top">
              <td>Text and image</td>
              <td>15.54<sup>d</sup></td>
              <td>0.173</td>
              <td>0.411</td>
              <td>0.523</td>
              <td>0.197</td>
              <td>0.278</td>
            </tr>
            <tr valign="top">
              <td>Radiologist</td>
              <td>18.47</td>
              <td>N/A<sup>f</sup></td>
              <td>N/A</td>
              <td>N/A</td>
              <td>N/A</td>
              <td>N/A</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup>Except for RadCliQ, which corresponds to the error rate, a higher score indicates a better approximation. For the automated metrics, the text and image–based impression score was highest, while the radiological score for the text-based impression was closest to the radiological ground truth.</p>
          </fn>
          <fn id="table1fn2">
            <p><sup>b</sup>BLEU: bilingual evaluation understudy.</p>
          </fn>
          <fn id="table1fn3">
            <p><sup>c</sup>BERT: Bidirectional Encoder Representations From Transformers.</p>
          </fn>
          <fn id="table1fn4">
            <p><sup>d</sup>Indicates a <italic>P</italic> value &#60;.05 for all higher input scores.</p>
          </fn>
          <fn id="table1fn5">
            <p><sup>e</sup>Indicates a <italic>P</italic> value &#60;.05 compared to the highest score.</p>
          </fn>
          <fn id="table1fn6">
            <p><sup>f</sup>N/A: not applicable.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>Scatterplots for each automated metric (BERT=blue; BLEU=yellow; CheXbert vector similarity=gray; RadGraph=light blue; RadCliQ=red) depending on the input: (A) image, (B) text, or (C) text and image. For the image input, all metrics except CheXbert vector similarity showed a significant correlation. However, the correlation was divergent or opposing for the text and text and image inputs. All correlation coefficients with their <italic>P</italic> values are shown in the lower section of the figure. BERT: Bidirectional Encoder Representations From Transformers; BLEU: bilingual evaluation understudy.</p>
        </caption>
        <graphic xlink:href="jmir_v25i1e50865_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>We evaluated the “out-of-the-box” performance of GPT-4 for chest x-ray impression generation based on different inputs. Based on the radiological score, text-based impressions were not significantly lower than the radiological impressions, whereas other inputs were rated significantly lower. Sun et al [<xref ref-type="bibr" rid="ref5">5</xref>] showed that text-based impressions rated by radiologists were inferior. However, the study did not clarify if the radiological evaluations of the impressions were conducted under blinded conditions. Our work identified radiological bias, as impressions classified as human written received higher ratings. Therefore, without blinding, there is a risk that the inferiority of the AI-generated impressions is due to bias.</p>
      <p>For the automated metrics, the impressions based on text and image were rated the closest to the radiological impressions, followed by text-based impressions. For the image-based impressions, there was a significant moderate correlation between the automated metrics and the radiological score; however, for the other inputs, opposite or nonsignificant correlations were found. Automatic metrics that capture relevant aspects of report quality are a prerequisite for successful development and clinical integration. Evaluation metrics, however, can only be as good as the human assessment, which is not free of bias and characterized by false heuristics [<xref ref-type="bibr" rid="ref9">9</xref>]. Our findings underline this point, as impressions that were classified as human written scored significantly higher in the radiological assessment. Human evaluation is not error-free, but it is the benchmark for the evaluation of generated text.</p>
      <p>Radiological heuristics, sources of error, and relevant aspects of radiological quality need to be further investigated, as they are essential for the development of useful model metrics.</p>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Detailed methods, detailed results, and a mosaic plot visualizing the justification for classifying an impression as artificial intelligence generated.</p>
        <media xlink:href="jmir_v25i1e50865_app1.docx" xlink:title="DOCX File, 5377 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>No generative model was used to write, edit, or review the manuscript.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets generated and analyzed during this study are available from the corresponding author upon reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>TB</given-names>
            </name>
            <name name-style="western">
              <surname>Mann</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ryder</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Subbiah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kaplan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dhariwal</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Neelakantan</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Language models are few-shot learners</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on May 28, 2020. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/pdf/2005.14165.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Radford</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Hallacy</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ramesh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Goh</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sastry</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Learning transferable visual models from natural language supervision</article-title>
          <year>2021</year>
          <conf-name>38th International Conference on Machine Learning</conf-name>
          <conf-date>July 18-24, 2021</conf-date>
          <conf-loc>Virtual</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bhayana</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bleakney</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Krishna</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 in radiology: improvements in advanced reasoning</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>06</month>
          <volume>307</volume>
          <issue>5</issue>
          <fpage>e230987</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230987</pub-id>
          <pub-id pub-id-type="medline">37191491</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Adams</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Truhn</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Busch</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Kader</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Niehues</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Makowski</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Bressem</surname>
              <given-names>KK</given-names>
            </name>
          </person-group>
          <article-title>Leveraging GPT-4 for post hoc transformation of free-text radiology reports into structured reporting: a multilingual feasibility study</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>05</month>
          <volume>307</volume>
          <issue>4</issue>
          <fpage>e230725</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230725</pub-id>
          <pub-id pub-id-type="medline">37014240</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Ong</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kennedy</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Elias</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lucas</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Shih</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Evaluating GPT4 on impressions generation in radiology reports</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <month>06</month>
          <volume>307</volume>
          <issue>5</issue>
          <fpage>e231259</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37367439"/>
          </comment>
          <pub-id pub-id-type="doi">10.1148/radiol.231259</pub-id>
          <pub-id pub-id-type="medline">37367439</pub-id>
          <pub-id pub-id-type="pmcid">PMC10534271</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Endo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Krishnan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Krishna</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>AY</given-names>
            </name>
            <name name-style="western">
              <surname>Rajpurkar</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Retrieval-based chest x-ray report generation using a pre-trained contrastive language-image model</article-title>
          <year>2021</year>
          <conf-name>Machine Learning for Health</conf-name>
          <conf-date>December 4, 2021</conf-date>
          <conf-loc>Virtual</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hartung</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bickle</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Gaillard</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Kanne</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>How to create a great radiology report</article-title>
          <source>Radiographics</source>
          <year>2020</year>
          <month>10</month>
          <volume>40</volume>
          <issue>6</issue>
          <fpage>1658</fpage>
          <lpage>1670</lpage>
          <pub-id pub-id-type="doi">10.1148/rg.2020200020</pub-id>
          <pub-id pub-id-type="medline">33001790</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Endo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Krishnan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Pan</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Tsai</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Reis</surname>
              <given-names>EP</given-names>
            </name>
            <name name-style="western">
              <surname>Fonseca</surname>
              <given-names>EKUN</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>HMH</given-names>
            </name>
            <name name-style="western">
              <surname>Abad</surname>
              <given-names>ZSH</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>AY</given-names>
            </name>
            <name name-style="western">
              <surname>Langlotz</surname>
              <given-names>CP</given-names>
            </name>
            <name name-style="western">
              <surname>Venugopal</surname>
              <given-names>VK</given-names>
            </name>
            <name name-style="western">
              <surname>Rajpurkar</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Evaluating progress in automatic chest X-ray radiology report generation</article-title>
          <source>Patterns (N Y)</source>
          <year>2023</year>
          <month>09</month>
          <day>08</day>
          <volume>4</volume>
          <issue>9</issue>
          <fpage>100802</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2666-3899(23)00157-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.patter.2023.100802</pub-id>
          <pub-id pub-id-type="medline">37720336</pub-id>
          <pub-id pub-id-type="pii">S2666-3899(23)00157-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC10499844</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jakesch</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hancock</surname>
              <given-names>JT</given-names>
            </name>
            <name name-style="western">
              <surname>Naaman</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Human heuristics for AI-generated language are flawed</article-title>
          <source>Proc Natl Acad Sci U S A</source>
          <year>2023</year>
          <month>03</month>
          <day>14</day>
          <volume>120</volume>
          <issue>11</issue>
          <fpage>e2208839120</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36881628"/>
          </comment>
          <pub-id pub-id-type="doi">10.1073/pnas.2208839120</pub-id>
          <pub-id pub-id-type="medline">36881628</pub-id>
          <pub-id pub-id-type="pmcid">PMC10089155</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Bagheri</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Summers</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>ChestX-ray8: hospital-scale chest x-ray database and benchmarks on weakly-supervised classification and localization of common thorax diseases</article-title>
          <source>Proceedings of the IEEE Conference on Computer Vision and Pattern Recognition</source>
          <year>2017</year>
          <fpage>2097</fpage>
          <lpage>2106</lpage>
          <pub-id pub-id-type="doi">10.1109/CVPR.2017.369</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Perez</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Piktus</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Petroni</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Karpukhin</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Goyal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Küttler</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Retrieval-augmented generation for knowledge-intensive NLP tasks</article-title>
          <source>arXiv. Preprint posted online on May 22, 2020</source>
          <year>2023</year>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Schuurmans</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bosma</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ichter</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title>
          <source>arXiv. Preprint posted online on January 28, 2022</source>
          <year>2023</year>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kojima</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Reid</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Matsuo</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Iwasawa</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Large language models are zero-shot reasoners</article-title>
          <source>arXiv. Preprint posted online on May 24, 2022</source>
          <year>2023</year>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Papineni</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Roukos</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ward</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>WJ</given-names>
            </name>
          </person-group>
          <article-title>BLEU: a method for automatic evaluation of machine translation</article-title>
          <year>2002</year>
          <conf-name>40th Annual Meeting of the Association for Computational Linguistics</conf-name>
          <conf-date>July 2002</conf-date>
          <conf-loc>Philadelphia, PA</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kishore</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Weinberger</surname>
              <given-names>KQ</given-names>
            </name>
            <name name-style="western">
              <surname>Artzi</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>BERTScore: evaluating text generation with BERT</article-title>
          <source>arXiv. Preprint posted online on April 21, 2019</source>
          <year>2023</year>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Smit</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rajpurkar</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Pareek</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lungren</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Combining automatic labelers and expert annotations for accurate radiology report labeling using BERT</article-title>
          <year>2020</year>
          <conf-name>Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>November 2020</conf-date>
          <conf-loc>Online</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Agrawal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Saporta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Truong</surname>
              <given-names>SQH</given-names>
            </name>
            <name name-style="western">
              <surname>Duong</surname>
              <given-names>DN</given-names>
            </name>
            <name name-style="western">
              <surname>Bui</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chambon</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>RadGraph: extracting clinical entities and relations from radiology reports</article-title>
          <source>arXiv. Preprint posted online on June 28, 2021</source>
          <year>2023</year>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
