<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v27i1e57257</article-id>
      <article-id pub-id-type="pmid">40080818</article-id>
      <article-id pub-id-type="doi">10.2196/57257</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Assessing Racial and Ethnic Bias in Text Generation by Large Language Models for Health Care–Related Tasks: Cross-Sectional Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>de Azevedo Cardoso</surname>
            <given-names>Taiane</given-names>
          </name>
        </contrib>
        <contrib contrib-type="editor">
          <name>
            <surname>Su</surname>
            <given-names>Zhaohui</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Yang</surname>
            <given-names>Rui</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Ma</surname>
            <given-names>Tianyu</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Estevão</surname>
            <given-names>Maria Dulce</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Hanna</surname>
            <given-names>John J</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Information Services</institution>
            <institution>ECU Health</institution>
            <addr-line>2100 Stantonsburg Rd</addr-line>
            <addr-line>Greenville, NC, 27834</addr-line>
            <country>United States</country>
            <fax>1 2528160081</fax>
            <phone>1 2528474100</phone>
            <email>john.hanna@ecuhealth.org</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0909-9396</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Wakene</surname>
            <given-names>Abdi D</given-names>
          </name>
          <degrees>BMSc</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0000-7270-0506</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Johnson</surname>
            <given-names>Andrew O</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0007-0513-0778</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Lehmann</surname>
            <given-names>Christoph U</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9559-4646</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Medford</surname>
            <given-names>Richard J</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9814-8043</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Information Services</institution>
        <institution>ECU Health</institution>
        <addr-line>Greenville, NC</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Division of Infectious Diseases</institution>
        <institution>Department of Internal Medicine</institution>
        <institution>East Carolina University</institution>
        <addr-line>Greenville, NC</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Clinical Informatics Center</institution>
        <institution>University of Texas Southwestern</institution>
        <addr-line>Dallas, TX</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Pediatrics</institution>
        <institution>University of Texas Southwestern</institution>
        <institution>The University of Texas Southwestern Medical Center</institution>
        <addr-line>Dallas, TX</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: John J Hanna <email>john.hanna@ecuhealth.org</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>13</day>
        <month>3</month>
        <year>2025</year>
      </pub-date>
      <volume>27</volume>
      <elocation-id>e57257</elocation-id>
      <history>
        <date date-type="received">
          <day>9</day>
          <month>2</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>10</day>
          <month>4</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>29</day>
          <month>4</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>16</day>
          <month>1</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©John J Hanna, Abdi D Wakene, Andrew O Johnson, Christoph U Lehmann, Richard J Medford. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 13.03.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2025/1/e57257" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Racial and ethnic bias in large language models (LLMs) used for health care tasks is a growing concern, as it may contribute to health disparities. In response, LLM operators implemented safeguards against prompts that are overtly seeking certain biases.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to investigate a potential racial and ethnic bias among 4 popular LLMs: GPT-3.5-turbo (OpenAI), GPT-4 (OpenAI), Gemini-1.0-pro (Google), and Llama3-70b (Meta) in generating health care consumer–directed text in the absence of overtly biased queries.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>In this cross-sectional study, the 4 LLMs were prompted to generate discharge instructions for patients with HIV. Each patient’s encounter deidentified metadata including race/ethnicity as a variable was passed over in a table format through a prompt 4 times, altering only the race/ethnicity information (African American, Asian, Hispanic White, and non-Hispanic White) each time, while keeping all other information constant. The prompt requested the model to write discharge instructions for each encounter without explicitly mentioning race or ethnicity. The LLM-generated instructions were analyzed for sentiment, subjectivity, reading ease, and word frequency by race/ethnicity.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The only observed statistically significant difference between race/ethnicity groups was found in entity count (GPT-4, df=42, <italic>P</italic>=.047). However, post hoc chi-square analysis for GPT-4’s entity counts showed no significant pairwise differences among race/ethnicity categories after Bonferroni correction.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>A total of 4 LLMs were relatively invariant to race/ethnicity in terms of linguistic and readability measures. While our study used proxy linguistic and readability measures to investigate racial and ethnic bias among 4 LLM responses in a health care–related task, there is an urgent need to establish universally accepted standards for measuring bias in LLM-generated responses. Further studies are needed to validate these results and assess their implications.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>sentiment analysis</kwd>
        <kwd>racism</kwd>
        <kwd>bias</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>reading ease</kwd>
        <kwd>word frequency</kwd>
        <kwd>large language models</kwd>
        <kwd>text generation</kwd>
        <kwd>healthcare</kwd>
        <kwd>task</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>cross sectional</kwd>
        <kwd>consumer-directed</kwd>
        <kwd>human immunodeficiency virus</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Large language models (LLMs), which are a subset of artificial intelligence technologies that process and generate text similar to human-generated text based on patterns and information learned from vast datasets, have generated anticipation and trepidation regarding their use in medicine and health care [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. LLMs are developed by training on extensive text-based datasets, enabling them to learn and predict word associations and contextually appropriate language use, [<xref ref-type="bibr" rid="ref3">3</xref>] and then apply the learned configurations of word combinations to natural language processing (NLP) assignments. Their potential application in medicine and health care is promising, as they can encode clinical knowledge and generate text responses for various health care tasks [<xref ref-type="bibr" rid="ref4">4</xref>].</p>
      <p>LLMs are designed to generate responses that closely resemble language, which allows individuals and businesses to use them for many text-based tasks. However, upon early release to the public, researchers raised concerns that biases found in human-generated text may be transferred and augmented in LLMs resulting in biased system responses, particularly on topics like gender and race [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. While “racial and ethnic bias” refers to prejudices or unfair differences in treatment or representation based on an individual’s race or ethnicity, conceptually, it encompasses stereotypes, prejudices, and discriminatory behaviors that disadvantage certain racial or ethnic groups. Operationally, in the context of LLMs, it can be measured by analyzing the differences in responses generated by LLMs when prompts include varying racial or ethnic identifiers.</p>
      <p>As a result of researchers detecting bias with targeted questions, developers of LLMs have restricted users from asking questions that demonstrate ingrained bias in an obvious fashion like “Create a table to display 10 words associated with Caucasians and 10 with Blacks in terms of occupations and intelligence.” While developers of LLMs have implemented these safeguards, the possibility of subtle biases persists. In response, our study aims to investigate the potential presence of racial bias among responses from 4 popular LLMs (OpenAI’s GPT-3.5-turbo [<xref ref-type="bibr" rid="ref7">7</xref>], OpenAI’s GPT-4 [<xref ref-type="bibr" rid="ref7">7</xref>], Google’s Gemini-1.0-pro [<xref ref-type="bibr" rid="ref8">8</xref>], and Meta’s Llama3-70b [<xref ref-type="bibr" rid="ref9">9</xref>]) to ordinary health care tasks that do not explicitly mention race.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Study Design</title>
        <p>We used structured hospital encounter metadata in a table format from 100 randomly selected fully deidentified encounters for patients with HIV. Data included the patient’s demographics, primary encounter diagnosis, and HIV disease control status at the time of the encounters. Interfacing with the LLM APIs (application programming interfaces), we sent requests to the 4 LLMs (OpenAI’s GPT-3.5-turbo, OpenAI’s GPT-4, Google’s Gemini-1.0-pro, and Meta’s Llama3-70b).</p>
        <p>The LLMs were prompted to write discharge instructions for a patient in English based on his/her hospital encounter information from the deidentified dataset. This prompt included the requested output structure and the patient encounter information where race/ethnicity was included. The used prompt is listed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>We submitted the same API request 4 times for each encounter for a total of 400 API requests to each LLM. In each iteration, we kept the submitted values (patient’s demographics, primary encounter diagnosis, and HIV disease control status) unchanged except for race/ethnicity. For each encounter, race/ethnicity were intentionally switched among African American, Asian, Hispanic White, and non-Hispanic White. We captured our queries and the generated text by each LLM as our dataset for analysis.</p>
        <p>The examined outcomes included polarity, subjectivity, named entity recognition (NER) counts, readability, word count, and the 10 most frequently used words. We selected these linguistic outcomes as potential surrogates for bias in generated responses by the LLM models based on the examined race/ethnicities.</p>
        <p>We used the en_core_web_sm model of the NLP library spaCy and the sentiment analysis library, TextBlob to perform NER and sentiment analysis on the text in our dataset. As sentiment analysis can determine the emotional tone behind words to provide valuable insights into the attitudes, opinions, and emotions of the writer, and in our case, potential related underlying biases in the generated text, we conducted sentiment analysis to calculate polarity and subjectivity scores for each generated text. Polarity is a float value within the range [–1.0, 1.0], where –1.0 indicates a negative sentiment, and 1.0 a positive sentiment. Values around 0 represent a neutral sentiment. Subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective. Using spaCy we identified named entities, which are real-world objects (eg, persons, locations, organizations, products, and events) that can be denoted with a proper name.</p>
        <p>We used the Python library textstat to evaluate the readability of text responses by the racial group provided as input. As evaluating the readability of the generated text is essential for understanding how easily patients can comprehend discharge instructions, we used the Flesch Reading Ease score and the Flesch-Kincaid Grade Level to assess text complexity. A lower Flesch-Kincaid Grade Level indicates text that is easier to read and understand.</p>
        <p>To explore if the models used certain words more frequently than others based on race/ethnicity, we calculated the word count in the output texts of each race/ethnicity. For preprocessing, we used the CountVectorizer class from the sklearn.feature_extraction.text module. This class tokenizes text (the process of splitting text into individual words) and performs count-based vectorization (the process of transforming words into numerical vectors that can be used for machine learning). We excluded common but uninformative words like “the,” “is,” “and,” etc, by excluding stop words. For a meaningful comparison, we then identified the 10 and 50 most frequent words used globally in the responses by each model and then stratified the word count by responses based on racial/ethnic group.</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>We compared the outcomes among the generated discharge instructions across different patient races/ethnicities for each LLM used. For continuous variables such as polarity scores, subjectivity scores, readability scores, readability grade levels, and text length, we used the Shapiro-Wilk test to check the data in each group for normality and used accordingly 1-way ANOVA or Kruskal-Wallis test to test for differences among groups. For categorical variables like NER counts and word frequency distributions, we used chi-square tests to compare observed frequencies across different races/ethnicities. When the omnibus chi-square test was significant, we performed post hoc pairwise chi-square comparisons with Bonferroni correction to identify specific group differences.</p>
        <p>The chi-square test of independence was conducted to assess the relationship between different categorical variables, using the chi2_contingency function from the SciPy library. In addition, we performed a one-way ANOVA or Kruskal-Wallis test to compare the means of different groups using the f_oneway or kruskal function, also from the SciPy library. We carried out both statistical analyses using the Python programming language.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>The study was conducted using a fully deidentified dataset that included no personal or protected health identifiers as defined under 45 CFR §46.102(f) of the US Department of Health and Human Services regulations [<xref ref-type="bibr" rid="ref10">10</xref>]. Thus, this research did not involve human subjects as defined by federal regulations and was not subject to institutional review board review or approval.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>The average polarity and subjectivity scores for the generated instructions varied minimally across races. The differences in the average polarity (<xref rid="figure1" ref-type="fig">Figure 1</xref>) of the discharge instructions generated for African American, Asian, Hispanic White, and non-Hispanic White patients were examined using the Kruskal-Wallis test; H-statistics were 0.94 (<italic>P</italic>=.82) for GPT-3.5-turbo, 3.19 (<italic>P</italic>=.36) for GPT-4, 1.45 (<italic>P</italic>=.70) for Gemini-1.0-pro, and 3.38 (<italic>P</italic>=.34) for Llama3-70b. The differences in the average subjectivity (<xref rid="figure2" ref-type="fig">Figure 2</xref>) of the discharge instructions generated for African American, Asian, Hispanic White, and non-Hispanic White patients were examined using ANOVA; F-statistics were 0.04 (<italic>P</italic>=.99) for GPT-3.5-turbo, 0.67 (<italic>P</italic>=.57) for GPT-4, 0.4 (<italic>P</italic>=.76) for Gemini-1.0-pro, and 0.3 (<italic>P</italic>=.82) for Llama3-70b.</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>Polarity of LLM-generated text by race/ethnicity. LLM: large language model.</p>
        </caption>
        <graphic xlink:href="jmir_v27i1e57257_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <fig id="figure2" position="float">
        <label>Figure 2</label>
        <caption>
          <p>Subjectivity of LLM-generated text by race/ethnicity. LLM: large language model.</p>
        </caption>
        <graphic xlink:href="jmir_v27i1e57257_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <p>We observed comparable results for the NER (<xref rid="figure3" ref-type="fig">Figure 3</xref>) with a <italic>χ</italic><sup>2</sup><sub>36</sub>=34.26 (<italic>P</italic>=.55) for GPT-3.5-turbo, <italic>χ</italic><sup>2</sup><sub>42</sub>=58.41 (<italic>P</italic>=.047) for GPT-4, <italic>χ</italic><sup>2</sup><sub>45</sub>=52.15 (<italic>P</italic>=.22) for Gemini-1.0-pro, <italic>χ</italic><sup>2</sup><sub>42</sub>=48.75 (<italic>P</italic>=.22) for Llama3-70b. Post hoc chi-square analysis for GPT-4’s entity counts showed no significant pairwise differences among race/ethnicity categories after the Bonferroni correction.</p>
      <fig id="figure3" position="float">
        <label>Figure 3</label>
        <caption>
          <p>Entity counts of LLM-generated text by race/ethnicity. LLM: large language model.</p>
        </caption>
        <graphic xlink:href="jmir_v27i1e57257_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <p>The readability ease scores and grade levels (<xref rid="figure4" ref-type="fig">Figures 4</xref> and <xref rid="figure5" ref-type="fig">5</xref>) showed no significant differences across the races/ethnicities (readability scores: H=4.01, <italic>P</italic>=.26 for GPT-3.5-turbo; H=0.86, <italic>P</italic>=.83 for GPT-4; H=2.26, <italic>P</italic>=.52 for Gemini-1.0-pro; and H=1.59, <italic>P</italic>=.66 for Llama3-70b; readability grade level: H=3.41, <italic>P</italic>=.33 for GPT-3.5-turbo; H=1.53, <italic>P</italic>=.68 for GPT-4; H=2.26, <italic>P</italic>=.52 for Gemini-1.0-pro; and H=1.41, <italic>P</italic>=.7 for Llama3-70b).</p>
      <fig id="figure4" position="float">
        <label>Figure 4</label>
        <caption>
          <p>Flesch Reading Ease of LLM-generated text by race/ethnicity. LLM: large language model.</p>
        </caption>
        <graphic xlink:href="jmir_v27i1e57257_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <fig id="figure5" position="float">
        <label>Figure 5</label>
        <caption>
          <p>Flesch-Kincaid grade of LLM-generated text by race/ethnicity. LLM: large language model.</p>
        </caption>
        <graphic xlink:href="jmir_v27i1e57257_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
      <p>The distribution of the word frequency of the 10 most frequent words used by each model across races/ethnicities did not statistically significantly vary (<italic>χ</italic><sup>2</sup><sub>27</sub>=6.02, <italic>P</italic>&#60;.99 for GPT-3.5-turbo; <italic>χ</italic><sup>2</sup><sub>27</sub>=14.87, <italic>P</italic>=.97 for GPT-4; <italic>χ</italic><sup>2</sup><sub>27</sub>=13.51, <italic>P</italic>=.99 for Gemini-1.0-pro; and <italic>χ</italic><sup>2</sup><sub>27</sub>=12.27, <italic>P</italic>=.99 for Llama3-70b). Similarly, no statistically significant difference was observed for the top 50 words distribution by each model across the examined races/ethnicities (<italic>χ</italic><sup>2</sup><sub>147</sub>=85.11, <italic>P</italic>&#60;.99 for GPT-3.5-turbo; <italic>χ</italic><sup>2</sup><sub>147</sub>=84.09, <italic>P</italic>&#60;.99 for GPT-4; <italic>χ</italic><sup>2</sup><sub>147</sub>=87.21, <italic>P</italic>&#60;.99 for Gemini-1.0-pro; and <italic>χ</italic><sup>2</sup><sub>147</sub>=93.21, <italic>P</italic>&#60;.99 for Llama3-70b; <xref rid="figure6" ref-type="fig">Figure 6</xref>).</p>
      <p>Statistical analysis results have been included in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p>
      <fig id="figure6" position="float">
        <label>Figure 6</label>
        <caption>
          <p>Frequency of top 10 words of LLM-generated text by race/ethnicity. LLM: large language model.</p>
        </caption>
        <graphic xlink:href="jmir_v27i1e57257_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>In our study using a prompt that included health care encounter data including race/ethnicity, we used text analysis techniques to compute the sentiment polarity and subjectivity of texts. We used NER to identify and categorize proper nouns and other significant terms within the corpus. We used the Flesch Reading Ease score (readability score) and Flesch-Kincaid Grade Level (readability grade) to evaluate the readability of the text generated. We also calculated the most frequently used words of generated text by 4 popular LLMs: GPT-3.5-turbo, GPT-4, Gemini-1.0-pro, and Llama3-70b. Our study found no major differences in these linguistic and readability factors that we used as proxy measures for bias among the 4 examined LLMs.</p>
        <p>LLMs hold great promise in health care, facilitating various tasks with realistic and knowledge-rich responses. Previous studies have demonstrated their effectiveness in patient interaction, medical knowledge representation, and simplifying medical language. In a study of patient questions posted on social media comparing responses by physicians and a chatbot using an LLM, the bot’s responses were not only preferred over the physicians’ but also ranked higher in empathy and quality [<xref ref-type="bibr" rid="ref11">11</xref>].</p>
        <p>LLMs not only produce realistic text responses, but they also encode clinical and other knowledge as demonstrated by ChatGPT performing at or near passing threshold for 3 steps of the United States Medical Licensing Exam and the Clinical Informatics examination [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. ChatGPT has also been successfully used to translate radiology reports into plain language [<xref ref-type="bibr" rid="ref14">14</xref>]. In a study where ChatGPT was presented with advice-seeking vignettes, ChatGPT was found to “consider” social factors like race altering clinical recommendations [<xref ref-type="bibr" rid="ref15">15</xref>].</p>
        <p>With its use in health care–related tasks when first released to the public, the concern of racial and other biases inherent in the LLMs becomes important. When detecting a gender bias by ChatGPT, Brown et al [<xref ref-type="bibr" rid="ref16">16</xref>] stated “Internet-trained models have internet-scale biases.” They found that providing the LLM with occupations requiring higher levels of education or hard physical labor elicited more male pronouns. Seeding ChatGPT with race/ethnicity resulted in high sentiment responses for Asian individuals and low for Black individuals. When using religious descriptors, “violent, terrorism, and terrorist” cooccurred at a greater rate with “Islam” than with other religions.</p>
        <p>By now, most LLM developers have locked their tools against task requests that are obviously seeking to elicit bias. To circumnavigate these blocks, we analyzed health care–related text generated in simple terms by 4 LLMs where the prompts were identical except for race/ethnicity without obviously seeking to elicit bias in the prompt. The NLP linguistic factors that we used in our study as proxy measures for bias were not vastly different based on race/ethnicity. While internet-trained models like ChatGPT have been shown to exhibit biases early on when first released to the public, our study did not elicit explicit bias in the absence of overtly biased queries after the application of safeguards by LLM developers.</p>
        <p>We used specific tools in this study to linguistically examine the models’ outputs. For example, by identifying real-world objects within the generated text, NER allowed us to pinpoint specific entities, such as medical terms or demographic information, within the responses. NER aided in understanding how the model handles and represents important details related to patient encounters, diagnoses, and demographic factors, thus contributing to a more nuanced assessment of potential biases.</p>
        <p>In addition, we conducted a readability and sentiment analysis to understand if the models tailor their responses differently to various racial groups in terms of text complexity. Readability analysis was crucial in evaluating the potential effect of generated text on patient comprehension and health care decision-making, thereby shedding light on any implicit bias in the LLMs. Sentiment analysis was also incorporated to understand the emotional tone and positive/negative connotations found within the produced text. In essence, the use of these tools at least partially enabled us to critically examine the models’ output for bias.</p>
        <p>Our study observed that the text generated by 4 popular LLMs exhibited no major differences across most examined surrogate linguistic metrics for racial/ethnic bias in generated responses. While this could imply that each examined model was relatively invariant to race/ethnicity in terms of these linguistic and readability metrics after the application of safeguards by LLMs developers, we must consider the potential for type II error.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Several limitations of this study warrant consideration. First, the small sample size of 100 encounters may limit the statistical power to detect subtle differences. Second, the study focused solely on encounters with patients with HIV, a topic heavily affected by socioeconomic disparities, which can potentially limit the applicability of the results to other medical conditions and patient populations. Third, we examined only specific racial/ethnic groups that may not capture all social factors. Fourth, the linguistic metrics such as polarity, subjectivity, readability scores, and word frequency as proxies for bias in the generated text, may not fully encapsulate all dimensions of bias, especially those that can affect patient comprehension and engagement. Standard racial/ethnic bias-specific metrics need to be developed and validated. Last, the LLMs used in this study are continually evolving, and their responses may change with updates or fine-tuning potentially influencing the reproducibility of our results over time.</p>
        <p>The nuanced understanding of bias in artificial intelligence (AI), as evidenced by our study design and findings, underscores the critical role of technology in shaping patient interactions and treatment outcomes. By discussing instances where AI responses may or may not vary by race, we can guide the development and deployment of more equitable AI systems. These improvements are vital for ensuring that all patient groups receive clear, understandable, and unbiased information when using LLMs, which is crucial for informed health care decision-making and equitable treatment.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>Four popular LLMs, tasked with generating health care–related text, created responses with no major difference based on race/ethnicity. While our findings imply that LLMs were relatively invariant to race/ethnicity in terms of linguistic and readability measures as proxy metrics for bias in generated medical text, our study justifies the need for future research using a larger sample size and more bias-specific analytical metrics to validate our study results and assess their implications.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Prompt used in the study.</p>
        <media xlink:href="jmir_v27i1e57257_app1.docx" xlink:title="DOCX File , 12 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Statistical analysis for each model's metrics by race/ethnicity.</p>
        <media xlink:href="jmir_v27i1e57257_app2.docx" xlink:title="DOCX File , 53 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">NER</term>
          <def>
            <p>named entity recognition</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">NLP</term>
          <def>
            <p>natural language processing</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>Research reported in this publication was supported by the National Center for Advancing Translational Sciences of the National Institutes of Health (award UL1 TR003163). The content is solely the responsibility of the authors and does not necessarily represent the official views of the National Institutes of Health. The initial draft of a few paragraphs in this manuscript utilized generative AI (ChatGPT-4, OpenAI, 2024), and was heavily reviewed, edited, and harmonized with the original manuscript paragraphs by ADW and JJH.</p>
    </ack>
    <notes>
      <title>Data Availability</title>
      <p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>JJH provided the analysis and the visualization with a critical review by the other authors.</p>
      </fn>
      <fn fn-type="conflict">
        <p>JJH has provided clinical AI consultations to Pieces Technologies, Inc.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Thirunavukarasu</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSJ</given-names>
            </name>
            <name name-style="western">
              <surname>Elangovan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gutierrez</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>TF</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSW</given-names>
            </name>
          </person-group>
          <article-title>Large language models in medicine</article-title>
          <source>Nat Med</source>
          <year>2023</year>
          <volume>29</volume>
          <issue>8</issue>
          <fpage>1930</fpage>
          <lpage>1940</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id>
          <pub-id pub-id-type="medline">37460753</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-023-02448-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Solomonides</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Koski</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Atabaki</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Weinberg</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>McGreevey</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Kannry</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Petersen</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lehmann</surname>
              <given-names>CU</given-names>
            </name>
          </person-group>
          <article-title>Defining AMIA's artificial intelligence principles</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2022</year>
          <volume>29</volume>
          <issue>4</issue>
          <fpage>585</fpage>
          <lpage>591</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35190824"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocac006</pub-id>
          <pub-id pub-id-type="medline">35190824</pub-id>
          <pub-id pub-id-type="pii">6534106</pub-id>
          <pub-id pub-id-type="pmcid">PMC8922174</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Esteva</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Robicquet</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ramsundar</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kuleshov</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>DePristo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Thrun</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dean</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A guide to deep learning in healthcare</article-title>
          <source>Nat Med</source>
          <year>2019</year>
          <volume>25</volume>
          <issue>1</issue>
          <fpage>24</fpage>
          <lpage>29</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-018-0316-z</pub-id>
          <pub-id pub-id-type="medline">30617335</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-018-0316-z</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singhal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mahdavi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Scales</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tanwani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cole-Lewis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Pfohl</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Payne</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Seneviratne</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gamble</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Babiker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schärli</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chowdhery</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mansfield</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Demner-Fushman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Agüera Y Arcas</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Webster</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Matias</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gottweis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tomasev</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rajkomar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Barral</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Semturs</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Large language models encode clinical knowledge</article-title>
          <source>Nature</source>
          <year>2023</year>
          <volume>620</volume>
          <issue>7972</issue>
          <fpage>172</fpage>
          <lpage>180</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37438534"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="medline">37438534</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC10396962</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bolukbasi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Saligrama</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Man is to computer programmer as woman is to homemaker? debiasing word embeddings</article-title>
          <source>ArXiv. Preprint posted online on July 21, 2016</source>
          <year>2016</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1607.06520"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hirani</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Farabi</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Marmon</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Experimenting with ChatGPT: concerns for academic medicine</article-title>
          <source>J Am Acad Dermatol</source>
          <year>2023</year>
          <volume>89</volume>
          <issue>3</issue>
          <fpage>e127</fpage>
          <lpage>e129</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jaad.2023.04.045</pub-id>
          <pub-id pub-id-type="medline">37179029</pub-id>
          <pub-id pub-id-type="pii">S0190-9622(23)00747-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="web">
          <article-title>GPT-3.5-turbo-0613 [Large language model]</article-title>
          <source>OpenAI</source>
          <access-date>2025-02-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://chat.openai.com">https://chat.openai.com</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="web">
          <article-title>Gemini 1.0 Pro</article-title>
          <source>Google</source>
          <access-date>2025-02-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/gemini-pro?pli=1">https://console.cloud.google.com/vertex-ai/publishers/google/model-garden/gemini-pro?pli=1</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="web">
          <article-title>Introducing Meta Llama3: the most capable openly available LLM to date</article-title>
          <source>Meta</source>
          <access-date>2024-04-18</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ai.meta.com/blog/meta-llama-3/">https://ai.meta.com/blog/meta-llama-3/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="web">
          <article-title>Revised Common Rule</article-title>
          <source>U.S. Department of Health and Human Services</source>
          <year>2018</year>
          <access-date>2024-12-15</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.hhs.gov/ohrp/regulations-and-policy/regulations/finalized-revisions-common-rule/index.html">https://www.hhs.gov/ohrp/regulations-and-policy/regulations/finalized-revisions-common-rule/index.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ayers</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Poliak</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Leas</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Kelley</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Faix</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Goodman</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Longhurst</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Hogarth</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <article-title>Comparing physician and artificial intelligence chatbot responses to patient questions posted to a public social media forum</article-title>
          <source>JAMA Intern Med</source>
          <year>2023</year>
          <volume>183</volume>
          <issue>6</issue>
          <fpage>589</fpage>
          <lpage>596</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37115527"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id>
          <pub-id pub-id-type="medline">37115527</pub-id>
          <pub-id pub-id-type="pii">2804309</pub-id>
          <pub-id pub-id-type="pmcid">PMC10148230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kumah-Crystal</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mankowitz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Embi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lehmann</surname>
              <given-names>CU</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and the clinical informatics board examination: the end of unproctored maintenance of certification?</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2023</year>
          <volume>30</volume>
          <issue>9</issue>
          <fpage>1558</fpage>
          <lpage>1560</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37335851"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocad104</pub-id>
          <pub-id pub-id-type="medline">37335851</pub-id>
          <pub-id pub-id-type="pii">7202064</pub-id>
          <pub-id pub-id-type="pmcid">PMC10436139</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lyu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zapadka</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Ponnatapura</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Niu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Myers</surname>
              <given-names>KJ</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Whitlow</surname>
              <given-names>CT</given-names>
            </name>
          </person-group>
          <article-title>Translating radiology reports into plain language using ChatGPT and GPT-4 with prompt learning: results, limitations, and potential</article-title>
          <source>Vis Comput Ind Biomed Art</source>
          <year>2023</year>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>9</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37198498"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s42492-023-00136-5</pub-id>
          <pub-id pub-id-type="medline">37198498</pub-id>
          <pub-id pub-id-type="pii">10.1186/s42492-023-00136-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC10192466</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nastasi</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Courtright</surname>
              <given-names>KR</given-names>
            </name>
            <name name-style="western">
              <surname>Halpern</surname>
              <given-names>SD</given-names>
            </name>
            <name name-style="western">
              <surname>Weissman</surname>
              <given-names>GE</given-names>
            </name>
          </person-group>
          <article-title>A vignette-based evaluation of ChatGPT's ability to provide appropriate and equitable medical advice across care contexts</article-title>
          <source>Sci Rep</source>
          <year>2023</year>
          <volume>13</volume>
          <issue>1</issue>
          <fpage>17885</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-023-45223-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-023-45223-y</pub-id>
          <pub-id pub-id-type="medline">37857839</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-023-45223-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC10587094</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>TB</given-names>
            </name>
            <name name-style="western">
              <surname>Mann</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ryder</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Subbiah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kaplan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dhariwal</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Neelakantan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shyam</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sastry</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Askell</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Herbert-Voss</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Krueger</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Henighan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Child</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ramesh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ziegler</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Winter</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hesse</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sigler</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Litwin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chess</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Berner</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>McCandlish</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Radford</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Amodei</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Language models are few-shot learners</article-title>
          <source>arXiv</source>
          <year>2020</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.48550/arXiv.2005.14165"/>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
