<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="letter"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e82256</article-id><article-id pub-id-type="doi">10.2196/82256</article-id><article-categories><subj-group subj-group-type="heading"><subject>Research Letter</subject></subj-group></article-categories><title-group><article-title>Evaluating the Potential of Reasoning Large Language Models to Perpetuate Racial and Gender Disease Stereotypes in Health Care</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Docking</surname><given-names>Joshua J</given-names></name><degrees>BMedSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Lee X</given-names></name><degrees>MBioStats</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Menz</surname><given-names>Bradley D</given-names></name><degrees>BPharm</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Bacchi</surname><given-names>Stephen</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hopkins</surname><given-names>Ashley M</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Sorich</surname><given-names>Michael J</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>College of Medicine and Public Health, Flinders University</institution><addr-line>GPO Box 2100</addr-line><addr-line>Adelaide</addr-line><addr-line>SA</addr-line><country>Australia</country></aff><aff id="aff2"><institution>Adelaide Medical School, Adelaide University</institution><addr-line>Adelaide</addr-line><addr-line>SA</addr-line><country>Australia</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Lin</surname><given-names>Kuan-Hsun</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Long</surname><given-names>Shi-yu</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Michael J Sorich, PhD, College of Medicine and Public Health, Flinders University, GPO Box 2100, Adelaide, SA, 5001, Australia, 61 (08) 82013217; <email>michael.sorich@flinders.edu.au</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>28</day><month>5</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e82256</elocation-id><history><date 
date-type="received"><day>09</day><month>09</month><year>2025</year></date><date date-type="rev-recd"><day>26</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>08</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Joshua J Docking, Lee X Li, Bradley D Menz, Stephen Bacchi, Ashley M Hopkins, Michael J Sorich. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 28.5.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e82256"/><abstract><p>This evaluation of 36,000 clinical vignettes found that next-generation reasoning large language models, o3-mini and DeepSeek-R1, frequently perpetuate racial and gender stereotypes for common medical conditions, indicating that advancements in reasoning do not inherently improve representational fairness.</p></abstract><kwd-group><kwd>large language model</kwd><kwd>reasoning LLM</kwd><kwd>artificial intelligence</kwd><kwd>bias</kwd><kwd>stereotypes</kwd><kwd>fairness</kwd><kwd>health equity</kwd><kwd>race</kwd><kwd>gender</kwd><kwd>representation</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Large language models (LLMs) have the potential to transform health care but risk exacerbating health disparities if they perpetuate biases [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. Zack and colleagues [<xref ref-type="bibr" rid="ref4">4</xref>] demonstrated potential racial and gender biases in clinical vignettes generated by GPT-4, including overrepresentation of Black patients in stereotypical medical conditions. Since then, next-generation reasoning LLMs have emerged, offering improved reasoning capability (&#x201C;thinking&#x201D; before answering), with this model class demonstrating superior benchmark performance [<xref ref-type="bibr" rid="ref5">5</xref>]. Whether this will reduce representational bias in health care remains unknown. 
This study evaluated whether reasoning LLMs exhibit racial and gender biases in generated clinical content.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>Using the methods of Zack et al [<xref ref-type="bibr" rid="ref4">4</xref>], two prominent reasoning LLMs of distinct geographic origin, o3-mini (OpenAI) and DeepSeek-R1 (DeepSeek; 671B full model), generated patient cases across 18 medical conditions that represent a spectrum of demographic-prevalence relationships (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), specifying a US population, using 10 prompt variations (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), and running 100 times each. Patient demographic characteristics in the generated vignettes were extracted using the methods of Zack et al [<xref ref-type="bibr" rid="ref4">4</xref>], and the proportion of race and gender representation for each condition was calculated. A qualitative analysis of DeepSeek-R1&#x2019;s reasoning traces was also performed on a random sample of 20 vignettes (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Misrepresentation (LLM estimate minus the published US epidemiological estimates [<xref ref-type="bibr" rid="ref4">4</xref>]) was summarized as the median (range) across 18 medical conditions. For example, if 60% of LLM-generated HIV cases were Black patients, compared to the 40% of Black patients reported in representative US studies of HIV (Table S2, <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), misrepresentation would be +20%. Positive values indicate overrepresentation, and negative values indicate underrepresentation. A difference greater than 20% was considered the threshold for significant misrepresentation, indicating a practically meaningful deviation in demographic representation. 
Sensitivity analyses using 10% and 30% thresholds confirmed that findings were robust to threshold selection (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). <italic>&#x03C7;</italic><sup>2</sup> goodness-of-fit tests with Benjamini-Hochberg false-discovery rate correction were used to assess whether the LLM-generated demographic distributions differed significantly from epidemiological baselines for each condition (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s3" sec-type="results"><title>Results</title><p>A total of 36,000 unique clinical vignettes were generated. Pairwise word-level Jaccard similarity confirmed substantive diversity, with a mean within-group similarity of 0.35 (SD 0.06 for DeepSeek-R1 and 0.08 for o3-mini) and near-duplicate pairs (Jaccard &#x003E;0.8) constituting fewer than 0.1% of comparisons (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Median misrepresentation for o3-mini was 44% (range &#x2212;12% to +75%) for Black, &#x2212;4.6% (range &#x2212;37% to 0%) for Asian, &#x2212;14% (range &#x2212;27% to +0.7%) for Hispanic, and &#x2212;8.2% (range &#x2212;56% to +33%) for White persons (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Median misrepresentation for DeepSeek-R1 was 31% (range &#x2212;21% to +81%) for Black, &#x2212;4.4% (range &#x2212;35% to +47%) for Asian, &#x2212;8.8% (range &#x2212;26% to +53%) for Hispanic, and &#x2212;21% (range &#x2212;63% to +41%) for White persons (<xref ref-type="fig" rid="figure2">Figure 2</xref>). For 78% (14/18) of medical conditions using o3-mini and 89% (16/18) using DeepSeek-R1, there was more than 20% misrepresentation for at least one race. 
<italic>&#x03C7;</italic><sup>2</sup> goodness-of-fit tests confirmed that the racial distributions generated by both models differed significantly from epidemiological baselines for all 18 conditions (all Benjamini-Hochberg corrected <italic>P</italic>&#x003C;.001; Table S5, <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Proportional representation of disease cases by race and gender in o3-mini&#x2013;generated clinical content compared with published US statistics. Blue dot on the right of red diamond indicates overrepresentation by the large language model. Blue dot on the left indicates underrepresentation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e82256_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Proportional representation of disease cases by race and gender in DeepSeek-R1&#x2013;generated clinical content compared with published US statistics. Blue dot on the right of red diamond indicates overrepresentation by the large language model. Blue dot on the left indicates underrepresentation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e82256_fig02.png"/></fig><p>Female median misrepresentation was &#x2212;27% (range &#x2212;47% to +31%) for o3-mini and &#x2212;23% (range &#x2212;47% to +33%) for DeepSeek-R1. For 56% (10/18) of medical conditions using o3-mini and 67% (12/18) using DeepSeek-R1, there was more than 20% misrepresentation. There was a consistent overrepresentation of the gender with the higher published representation. 
Gender distributions also differed significantly from epidemiological baselines for all non&#x2013;sex-linked conditions (all Benjamini-Hochberg corrected <italic>P</italic>&#x003C;.001; Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><p>Reasoning LLMs, such as o3-mini and DeepSeek-R1, frequently misrepresent the distribution of race and gender in medical conditions, mirroring issues previously observed in GPT-4 [<xref ref-type="bibr" rid="ref4">4</xref>], which met the threshold for significant misrepresentation in 67% (12/18) of conditions for race and 67% (12/18) for gender [<xref ref-type="bibr" rid="ref4">4</xref>]. Our results show comparable or higher rates for o3-mini (78% race, 56% gender) and DeepSeek-R1 (89% race, 67% gender), indicating no improvement in representation with the newer reasoning models.</p><p>Both o3-mini and DeepSeek-R1, like GPT-4, overrepresented Black populations in stereotypically associated conditions (eg, sarcoidosis, systemic lupus erythematosus, pre-eclampsia, essential hypertension) [<xref ref-type="bibr" rid="ref4">4</xref>], with even higher median misrepresentation of 44% and 31%, respectively, compared to 15% in GPT-4 [<xref ref-type="bibr" rid="ref4">4</xref>]. This persistent pattern may reflect underlying bias, though models may also default to generating prototypical cases rather than representative samples due to patterns in their training data. Qualitative analysis of DeepSeek-R1&#x2019;s reasoning traces supports this, revealing that the model explicitly invoked disease-demographic associations (eg, &#x201C;more prevalent in&#x201D;) when selecting patient demographics, without referencing quantitative epidemiological data (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
This explicit demographic deliberation may also explain the higher misrepresentation observed in the reasoning models included in this study compared to GPT-4, as the extended reasoning process may amplify stereotypical associations by actively invoking them during generation. In either case, consistently overrepresenting certain demographic groups, particularly for conditions that in practice affect diverse populations, risks reinforcing narrowed demographic assumptions in clinical contexts where understanding disease prevalence across populations is an important component of diagnostic reasoning. Similarly, the consistent exaggeration of the majority gender aligns with previous findings showing LLM outputs skew toward gender stereotypes in health care roles, which could further marginalize minority genders [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>This study&#x2019;s strengths include its evaluation of next-generation reasoning LLMs and the robust assessment from 36,000 generated clinical vignettes. Limitations include our focus on a US context, and that DeepSeek-R1&#x2019;s development outside the United States may mean that deviations partly reflect differences in training data representation, though the similar directional patterns between both models suggest shared stereotypical associations. The demographic categories were also adopted from Zack et al [<xref ref-type="bibr" rid="ref4">4</xref>] to enable direct comparison but do not capture Native American, multiracial, nonbinary, or transgender populations and treat &#x201C;Hispanic&#x201D; as a race rather than an ethnicity. Additionally, given the rapid evolution of the LLM landscape, GPT-4 comparisons are based on published data from Zack et al [<xref ref-type="bibr" rid="ref4">4</xref>] rather than a concurrent control run with identical prompts, which limits definitive comparative conclusions. 
However, the present study&#x2019;s inclusion of explicit US geographic context in prompts would, if anything, be expected to reduce deviations from US epidemiological baselines, suggesting the comparison is conservative. Further detail on comparability is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The qualitative analysis was exploratory and based on a small sample, limiting the generalizability of the mechanistic observations. Future research should assess generalizability across different regions and broader condition sets, and evaluate whether prompt-level strategies can mitigate the observed patterns. For example, explicit instructions to reflect epidemiological distributions or providing previously generated cases as context to encourage demographic diversity across outputs may help shift generation from prototypical to representative cases, particularly given our finding that the model explicitly deliberates about demographic associations during generation. Further research should also directly evaluate whether generation biases in LLMs translate to impacts on clinical decision-making and potential patient harms.</p><p>In conclusion, despite enhanced reasoning capabilities, the clinical outputs of o3-mini and DeepSeek-R1 still exhibit racial and gender disease stereotyping in common medical conditions. Advancements in LLM capabilities do not guarantee parallel improvements across all dimensions [<xref ref-type="bibr" rid="ref7">7</xref>], including, as demonstrated here, fairness and representation in health care. Awareness of these demographic defaults is essential for the safe integration of LLMs into clinical workflows, and continuous monitoring of potential biases should accompany their adoption.</p></sec></body><back><ack><p>ChatGPT 4o (OpenAI) and Gemini 2.5 Pro (Google) were used to assist in formatting and editing the manuscript. 
The authors reviewed and verified all artificial intelligence&#x2013;assisted content.</p></ack><notes><sec><title>Funding</title><p>MJS is supported by a Beat Cancer Research Fellowship from the Cancer Council South Australia (PRF2719). AMH holds an Emerging Leader Investigator Fellowship from the National Health and Medical Research Council, Australia (APP2008119). The PhD scholarship of BDM is supported by the National Health and Medical Research Council, Australia (APP2030913). The funders had no role in the design and conduct of the study; collection, management, analysis, and interpretation of the data; preparation, review, or approval of the manuscript; and decision to submit the manuscript for publication.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: MJS</p><p>Methodology: MJS</p><p>Software: JJD</p><p>Validation: JJD, LXL, BDM, SB, AMH, MJS</p><p>Formal analysis: JJD, MJS</p><p>Investigation: JJD</p><p>Resources: MJS</p><p>Data curation: JJD</p><p>Writing &#x2013; original draft: JJD</p><p>Writing &#x2013; review &#x0026; editing: LXL, BDM, SB, AMH, MJS</p><p>Visualization: JJD</p><p>Supervision: MJS</p><p>Project administration: JJD, MJS</p><p>Funding acquisition: MJS</p></fn><fn fn-type="conflict"><p>MJS reported receiving grants from Pfizer, AstraZeneca, Boehringer Ingelheim, and the National Health and Medical Research Council of Australia outside the submitted work. AMH reported receiving grants from Boehringer Ingelheim, Hospital Research Foundation, Tour De Cure, and Flinders Foundation outside the submitted work. 
No other disclosures were reported.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Weidinger</surname><given-names>L</given-names> </name><name name-style="western"><surname>Uesato</surname><given-names>J</given-names> </name><name name-style="western"><surname>Rauh</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Taxonomy of risks posed by language models</article-title><conf-name>FAccT &#x2019;22: 2022 ACM Conference on Fairness, Accountability, and Transparency</conf-name><conf-date>Jun 21-24, 2022</conf-date><pub-id pub-id-type="doi">10.1145/3531146.3533088</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adam</surname><given-names>H</given-names> </name><name name-style="western"><surname>Balagopalan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alsentzer</surname><given-names>E</given-names> </name><name name-style="western"><surname>Christia</surname><given-names>F</given-names> </name><name name-style="western"><surname>Ghassemi</surname><given-names>M</given-names> </name></person-group><article-title>Mitigating the impact of biased artificial intelligence in emergency decision-making</article-title><source>Commun Med (Lond)</source><year>2022</year><month>11</month><day>21</day><volume>2</volume><issue>1</issue><fpage>149</fpage><pub-id pub-id-type="doi">10.1038/s43856-022-00214-4</pub-id><pub-id pub-id-type="medline">36414774</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Pfohl</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Cole-Lewis</surname><given-names>H</given-names> </name><name name-style="western"><surname>Sayres</surname><given-names>R</given-names> </name><etal/></person-group><article-title>A toolbox for surfacing health equity harms and biases in large language models</article-title><source>Nat Med</source><year>2024</year><month>12</month><volume>30</volume><issue>12</issue><fpage>3590</fpage><lpage>3600</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03258-2</pub-id><pub-id pub-id-type="medline">39313595</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zack</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lehman</surname><given-names>E</given-names> </name><name name-style="western"><surname>Suzgun</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Assessing the potential of GPT-4 to perpetuate racial and gender biases in health care: a model evaluation study</article-title><source>Lancet Digit Health</source><year>2024</year><month>01</month><volume>6</volume><issue>1</issue><fpage>e12</fpage><lpage>e22</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(23)00225-X</pub-id><pub-id pub-id-type="medline">38123252</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Brodeur</surname><given-names>PG</given-names> </name><name name-style="western"><surname>Buckley</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Kanjee</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Superhuman performance of a large language model on the reasoning tasks of a 
physician</article-title><source>arXiv</source><comment>Preprint posted online on Dec 14, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.10849</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Menz</surname><given-names>BD</given-names> </name><name name-style="western"><surname>Kuderer</surname><given-names>NM</given-names> </name><name name-style="western"><surname>Chin-Yee</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Gender representation of health care professionals in large language model-generated stories</article-title><source>JAMA Netw Open</source><year>2024</year><month>09</month><day>3</day><volume>7</volume><issue>9</issue><fpage>e2434997</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.34997</pub-id><pub-id pub-id-type="medline">39312237</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cui</surname><given-names>DX</given-names> </name><name name-style="western"><surname>Long</surname><given-names>SY</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>YX</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Q</given-names> </name></person-group><article-title>Can reasoning power significantly improve the knowledge of large language models for chemistry?&#x2500;Based on conversations with LLMs</article-title><source>J Chem Inf Model</source><year>2025</year><month>09</month><day>22</day><volume>65</volume><issue>18</issue><fpage>9516</fpage><lpage>9527</lpage><pub-id pub-id-type="doi">10.1021/acs.jcim.5c01265</pub-id><pub-id 
pub-id-type="medline">40854079</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Methods, model details, data sources, analyses, and vignette uniqueness.</p><media xlink:href="jmir_v28i1e82256_app1.pdf" xlink:title="PDF File, 245 KB"/></supplementary-material></app-group></back></article>