<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e85770</article-id><article-id pub-id-type="doi">10.2196/85770</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Patient Cognitive Bias in Large Language Model&#x2013;Supported Health Consultations: Simulation-Based Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Zuo</surname><given-names>Yi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wan</surname><given-names>Qifeng</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Wang</surname><given-names>Shalong</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>School of Computer Science and Artificial Intelligence, Hunan University of Finance and 
Economics</institution><addr-line>Changsha</addr-line><addr-line>Hunan</addr-line><country>China</country></aff><aff id="aff2"><institution>Hunan Green Development Research Institute, School of Economics and Management, Central South University of Forestry and Technology</institution><addr-line>Changsha</addr-line><addr-line>Hunan</addr-line><country>China</country></aff><aff id="aff3"><institution>Department of General Surgery, Second Xiangya Hospital of Central South University</institution><addr-line>139 Renmin Middle Road</addr-line><addr-line>Changsha</addr-line><addr-line>Hunan</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Chrimes</surname><given-names>Dillon</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Li</surname><given-names>Irene</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lin</surname><given-names>Kuan-Hsun</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Shalong Wang, MD, Department of General Surgery, Second Xiangya Hospital of Central South University, 139 Renmin Middle Road, Changsha, Hunan, 410011, China, 86 073185295167; <email>wangshalong@csu.edu.cn</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>11</day><month>6</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e85770</elocation-id><history><date date-type="received"><day>13</day><month>10</month><year>2025</year></date><date date-type="rev-recd"><day>02</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>04</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; 
Yi Zuo, Qifeng Wan, Shalong Wang. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 11.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e85770"/><abstract><sec><title>Background</title><p>Large language models (LLMs) are increasingly used by patients for health information and preliminary medical advice. In patient-facing consultations, users may present explicitly stated diagnostic preferences or symptom narratives emphasizing a preferred explanation. 
Such cognitively biased input constrains the diagnostic context available to the model and may systematically steer its reasoning during interactive LLM-supported health consultations.</p></sec><sec><title>Objective</title><p>This study aimed to quantify the impact of patient cognitive bias on LLM diagnostic performance in multiturn consultations, assess the effectiveness of prompt-based mitigation strategies and decoding temperature adjustment, and evaluate a dual-system framework for improving robustness under biased interaction.</p></sec><sec sec-type="methods"><title>Methods</title><p>We developed a simulated patient agent to generate both unbiased and cognitively biased consultations using 1273 cases from the medical question answering dataset United States Medical Licensing Examination. Six widely used LLMs of varying capacities were evaluated through 3-round, multiturn dialogues, after which each model produced a final diagnostic judgment based on the complete consultation record. Diagnostic accuracy was the primary outcome. Secondary outcomes included bias-induced accuracy decline (absolute reduction in accuracy under biased vs standard consultations) and bias-influenced error proportion (proportion of incorrect responses aligned with the patient&#x2019;s preferred but incorrect diagnosis). Three prompt-based mitigation strategies and 4 decoding temperature settings were tested. In addition, a dual-system framework was evaluated, in which a conversational foundation LLM conducted patient interaction and history taking (System 1), while a reasoning-oriented LLM (o1-mini) generated the final diagnostic judgment (System 2). 
In the foundation-only condition, the same LLM performed both interaction and diagnosis.</p></sec><sec sec-type="results"><title>Results</title><p>Across all 6 evaluated models, cognitively biased consultations led to marked diagnostic accuracy declines of approximately 7 to 39 percentage points compared with standard multiturn consultations, whereas static single-response tests and standard consultations showed comparable accuracy. Larger deteriorations were observed in lower-capacity models, with some approaching random-guess performance under bias. Errors were frequently aligned with patient bias, with bias-influenced error proportion exceeding one-third across models, indicating systematic conformity rather than random error. Prompt-based mitigation strategies and decoding temperature reduction yielded limited and inconsistent improvements and did not reliably prevent bias-induced performance loss. By contrast, the dual-system framework substantially improved diagnostic accuracy under biased conditions, producing gains of approximately 10 to 39 percentage points across most models and recovering a large proportion of the performance lost due to bias, particularly in lower-capacity systems.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Patient-driven cognitive bias represents an underrecognized behavioral risk in LLM-supported health consultations. Common mitigation approaches, such as prompt engineering or decoding parameter adjustment, provide limited resilience. 
Explicitly separating conversational interaction from deliberative diagnostic reasoning through a dual-system framework enables more robust diagnostic performance under biased input while potentially preserving patient-facing dialogue fluency by retaining the foundation LLM as the conversational component, offering a scalable design strategy for safer medical AI systems.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>cognitive bias</kwd><kwd>artificial intelligence</kwd><kwd>health information seeking</kwd><kwd>clinical consultation</kwd><kwd>human-AI interaction</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The rapid diffusion of conversational artificial intelligence (AI) is reshaping how people access health information. Large language models (LLMs) such as ChatGPT and Gemini increasingly serve as informal health advisors&#x2014;interpreting test results, answering symptom queries, and suggesting treatments. Recent national surveys show that 17% of US adults use AI chatbots monthly for health advice [<xref ref-type="bibr" rid="ref1">1</xref>], 9.9% of Australians have sought medical information from ChatGPT in the past 6 months [<xref ref-type="bibr" rid="ref2">2</xref>], and 21.5% of US respondents reported using ChatGPT for online health information [<xref ref-type="bibr" rid="ref3">3</xref>]. This widespread adoption marks a public health&#x2013;scale shift in how patients prepare for clinical encounters and form preliminary diagnostic beliefs.</p><p>Patients commonly arrive at clinics with self-formed or partial diagnoses&#x2014;a routine feature of modern health-seeking behavior. Such self-diagnosis often reflects underlying cognitive biases, particularly confirmation bias, which is the tendency to favor evidence supporting preexisting beliefs while disregarding contradictions [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref6">6</xref>]. 
When such bias originates from patients and interacts with LLMs, its effects can be amplified. Because LLMs are highly sensitive to input framing [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>], they may mirror or even reinforce users&#x2019; misconceptions instead of correcting them&#x2014;creating a feedback loop that strengthens erroneous self-diagnoses and distorts subsequent decision-making. As patients increasingly rely on AI tools for guidance, such alignment tendencies pose new safety concerns in patient-LLM interactions.</p><p>Despite the rapid integration of LLMs into virtual consultations [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>], health education [<xref ref-type="bibr" rid="ref11">11</xref>], and clinical decision support [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref15">15</xref>], existing evaluations remain largely model-centric. They emphasize architecture and training data rather than behavioral variability that shapes real-world dialogues. Regulatory bodies have begun to recognize this oversight: the US Food and Drug Administration has called for incorporating user behavioral factors in AI assessments [<xref ref-type="bibr" rid="ref16">16</xref>], and the World Health Organization&#x2019;s 2024 Guidance on the Ethics and Governance of Large Multimodal Models highlights the need for transparent oversight and inclusive governance to safeguard equitable use [<xref ref-type="bibr" rid="ref17">17</xref>]. However, systematic evaluation of how patient-driven bias influences LLM reasoning remains absent.</p><p>To address this gap, we developed a simulation framework that models both unbiased and cognitively biased consultations using an LLM-powered patient agent, enabling controlled evaluation of diagnostic performance under behavioral distortion. 
Building on dual-process cognitive theory [<xref ref-type="bibr" rid="ref18">18</xref>], we further propose a dual-system architecture in which a foundation LLM serves as &#x201C;System 1&#x201D; for efficient, natural dialogue and a reasoning-oriented LLM acts as &#x201C;System 2&#x201D; for deliberate diagnostic judgment.</p><p>The aim of this study was to identify patient cognitive bias as a user-driven risk factor in patient-LLM interactions, distinct from model architecture or data quality. Our findings show that LLMs often align with patient misconceptions, amplifying the risk of erroneous understanding and decision-making. To achieve this aim, we introduce a reproducible simulation framework to quantify diagnostic performance under bias-influenced interactions and evaluate whether a dual-system design&#x2014;integrating a conversational foundation model for efficient dialogue with a reasoning model for analytical judgment&#x2014;enhances resilience to cognitive bias while aiming to maintain conversational fluency by retaining the foundation model for patient-facing interaction. By systematically incorporating behavioral variability into evaluation, this work extends current evidence on the reliability of medical LLMs and provides design considerations for safer, bias-aware patient-facing health applications.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Evaluation Setup and Medical Question Answering Benchmark</title><p>We evaluated the clinical judgment performance of LLMs using 2 complementary approaches: (1) multiturn simulated patient-LLM interactions and (2) single-response tests. All evaluations were automated via a Python 3.13.0&#x2013;based script to ensure consistency and reproducibility (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the study workflow and evaluation framework. 
LLM: large language model; MedQA-USMLE: medical question answering dataset United States Medical Licensing Examination.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e85770_fig01.png"/></fig><p>The benchmark dataset was the MedQA-USMLE (medical question answering dataset United States Medical Licensing Examination), which contains 1273 multiple-choice clinical cases assessing diagnostic accuracy, treatment planning, and overall clinical judgment. Each case comprises a clinical scenario, a question, 4 answer options (1 correct and 3 distractors), and the reference answer. Before generating cognitively biased patient dialogues, we used a separate GPT-4o-Mini&#x2013;based scoring step to evaluate incorrect options based on their potential to serve as plausible but misleading patient explanations and selected the highest-scoring distractor to condition the biased simulation. When multiple incorrect options received the same highest score, the final selection among the tied options was performed randomly.</p></sec><sec id="s2-2"><title>Simulated Patient Agent Design Framework</title><p>LLMs have been shown to simulate human-like behavior and cognitive bias in controlled settings [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. Building on these findings, we developed a structured simulated patient agent that models both standard and biased consultations by manipulating information emphasis and framing rather than omitting diagnostic content. 
This framework enables controlled evaluation of LLM diagnostic performance under behaviorally biased interactions (<xref ref-type="fig" rid="figure2">Figure 2</xref>; <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Framework of simulated patient&#x2013;large language model (LLM) interactions under standard and cognitively biased conditions.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e85770_fig02.png"/></fig><p>The framework illustrates how a simulated patient agent, powered by GPT-4o-Mini, engages with foundation LLMs under 2 distinct interaction patterns. In standard interactions, the patient provides a comprehensive and accurate account of their condition. In cognitively biased interactions, the patient adopts a preferred but incorrect option (selected from MedQA distractors) and selectively emphasizes and interprets information in a manner consistent with this incorrect self-diagnosis, while potentially contradictory information remains present but is downweighted. Each foundation LLM responds using either standard role prompts or mitigation-strategy prompts, depending on the evaluation condition.</p><p>The protocol standardizes patient presentations and controls key variables to ensure comparability across models, with 2 core components:</p><list list-type="order"><list-item><p>Clinical scenario context: The patient agent&#x2019;s description is derived directly from MedQA test scenarios, ensuring that clinically relevant information is preserved and that no irrelevant or fabricated content is introduced across both standard and cognitively biased interaction modes.</p></list-item><list-item><p>Interaction patterns: (1) Standard patient provides a comprehensive, unbiased account of their condition. 
(2) Cognitively biased patient adopts a preferred but incorrect diagnostic explanation, generated by GPT-4o-Mini from the selected incorrect MedQA answer option (distractor). This patient selectively emphasizes and interprets information consistent with the incorrect self-diagnosis and maintains this belief throughout the interaction, while potentially contradictory information remains present but is downweighted in the patient&#x2019;s narrative. This interaction pattern challenges the LLM&#x2019;s diagnostic reasoning by introducing biased framing rather than information insufficiency.</p></list-item></list><p>The framework generates distinct behavioral patterns from identical clinical scenarios, uses natural, first-person patient language, and avoids any disclosure of AI identity to preserve realism. Together, these constraints ensured that the patient agent operated as a controlled simulation component rather than an unconstrained conversational model.</p><p>After evaluating candidate models, GPT-4o-Mini was selected to implement the patient agent because it balances cost-effectiveness and response speed with the ability to generate clinically coherent, realistic interactions, while also consistently reproducing bias-aligned behaviors (eg, selective emphasis, biased interpretation, and repetition), making it well suited for simulating cognitively biased patients. To ensure consistency and reproducibility, the patient agent was operated under constrained instructions using predefined clinical information only, with fixed model parameters (temperature=1) across simulations to maintain consistent behavioral patterns while allowing natural conversational variability. 
This design was intended to simulate a strong self-diagnosis framing condition for controlled evaluation, rather than the full range of patient bias observed in real consultations.</p></sec><sec id="s2-3"><title>Multiturn Patient-LLM Consultation Simulation</title><p>The simulated consultation begins with the patient agent presenting a detailed clinical case, including demographics, medical history, symptoms, and diagnostic findings, to establish the context for the interaction. To maintain realism, the evaluated LLMs do not have direct access to the original MedQA scenario text; instead, they must elicit and interpret relevant information through dialogue. Each consultation unfolds over 3 rounds of interaction with the patient agent (<xref ref-type="fig" rid="figure2">Figure 2</xref>), during which the LLM progressively gathers clinical details and refines its diagnostic reasoning. This multiturn structure is designed to approximate real-world patient-LLM consultations.</p><p>An example of dialogue and evaluation records between standard and cognitively biased patient agents and the LLMs is provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. To ensure comparability across models, the interaction structure was standardized by instructing physician models to respond directly to the patient consultation without introductory role announcements or meta-narrative statements.</p></sec><sec id="s2-4"><title>LLMs and Temperature Settings</title><p>We evaluated 6 widely used foundation LLMs of varying capacities to assess susceptibility to patient cognitive bias in simulated consultations: GPT-4o (gpt-4o-2024-11-20), GPT-4o-Mini (gpt-4o-mini-2024-07-18), ChatGPT-3.5-Turbo (gpt-3.5-turbo-0125), Gemini 1.5 Pro (gemini-1.5-pro), Gemini 1.5 Flash (gemini-1.5-flash-002), and Gemini 1.5 Flash-8B (gemini-1.5-flash-8b). 
All selected models demonstrated baseline medical reasoning competence, with static benchmark accuracy exceeding 50% [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>], enabling a representative comparison across different model capacities and deployment profiles.</p><p>These models span a spectrum from high-capacity general-purpose systems with strong reasoning capabilities (eg, GPT-4o, Gemini 1.5 Pro) to efficiency-optimized and lower-capacity variants (eg, GPT-4o-Mini, Gemini 1.5 Flash-8B), reflecting real-world patient-facing deployment settings. For contextual comparison within the dual-system framework, we additionally reference o1-mini, a reasoning-oriented model explicitly optimized for structured, deliberative problem-solving, which was used as the System 2 component [<xref ref-type="bibr" rid="ref24">24</xref>].</p><p>To examine the effect of output determinism on robustness to biased input, each model was evaluated at 4 decoding temperatures (T=1.0, 0.7, 0.3, and 0.0) in both static single-response evaluations and multiturn simulated consultations.</p></sec><sec id="s2-5"><title>Assessing Clinical Judgment of LLMs Through Multiturn Simulated Consultations</title><p>After 3 rounds of simulated consultation between the patient agent and the LLM, the model was instructed to generate a final diagnostic judgment based on the complete consultation record, using the instruction: &#x201C;Please provide your answer by stating only the option letter (A/B/C/D) without any explanation.&#x201D;</p><p>The final diagnostic judgment was generated after completion of the consultation, based on the full interaction record, rather than as part of the ongoing dialogue. The selected answer was then compared with the MedQA reference answer to ensure a consistent and objective evaluation of clinical judgment accuracy (see <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). 
In the foundation-only condition, the same model was responsible for both conducting the multiturn consultation and generating the final diagnostic judgment, following the same judgment procedure as used in the dual-system framework.</p><p>In standard patient interactions, outcomes were classified as either correct or incorrect, as no patient bias was introduced. In cognitively biased interactions, outcomes were categorized into 3 types:</p><list list-type="bullet"><list-item><p>Correct: Responses that match the reference answer in the MedQA test set.</p></list-item><list-item><p>Bias-influenced error: Errors where the model&#x2019;s response aligns with the patient&#x2019;s biases, indicating susceptibility to cognitive biases.</p></list-item><list-item><p>Other incorrect: Responses that do not match the correct answer and are unrelated to patient biases, reflecting general errors in the model&#x2019;s reasoning or understanding.</p></list-item></list></sec><sec id="s2-6"><title>Mitigation Strategy</title><p>To mitigate the impact of patient cognitive bias on diagnostic reasoning, we developed 3 prompt-engineering strategies and assessed their effectiveness within the simulated patient-LLM interaction framework. All strategies were implemented in a zero-shot setting and designed to modulate the LLM&#x2019;s role behavior during consultations and final clinical judgment.</p><list list-type="bullet"><list-item><p>Bias-aware: Instructs the model to identify potential cognitive bias in patient inputs by distinguishing subjective assertions from objective clinical facts. 
This enables real-time bias detection without predefined examples, supporting scalable recognition of novel bias patterns.</p></list-item><list-item><p>All-inclusive: Directs the model to &#x201C;consider all relevant medical aspects&#x201D; before responding, thereby broadening the diagnostic scope, reducing selective attention to bias-congruent information, and prioritizing evidence-based decision-making.</p></list-item><list-item><p>Step-by-step: Instructs the model to separate objective data from subjective content before making decisions, ensuring that clinical conclusions are grounded in verifiable information rather than patient-driven bias.</p></list-item></list><p>Detailed prompt formulations for each strategy are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-7"><title>Dual-System Framework</title><p>The dual-system framework is designed to emulate the complementary strengths of intuitive and analytical reasoning described in dual-process cognitive theory. In this design, the foundation LLM (System 1) conducts a 3-round interactive consultation with the simulated patient agent, efficiently eliciting the patient&#x2019;s history and generating preliminary clinical impressions. The complete consultation record&#x2014;including patient-reported details and the foundation LLM&#x2019;s intermediate responses&#x2014;is then transferred to the reasoning LLM (o1-mini, System 2), which performs deliberate, structured diagnostic reasoning to produce the final clinical judgment. Model outputs are compared with the correct MedQA reference answers to assess diagnostic accuracy. This division of labor is intended to preserve the speed and fluency of patient interaction while enhancing diagnostic robustness through targeted analytical evaluation.</p></sec><sec id="s2-8"><title>Statistical Analysis</title><p>All analyses were descriptive. 
Diagnostic accuracy in multiturn consultation conditions was reported as mean accuracy (SD, in %) across 3 repeated runs, with each run including all 1273 MedQA-USMLE test cases. Static single-response evaluations were performed once and are reported without SD. Bias-induced accuracy decline (BIAD) was defined as the absolute reduction in accuracy (percentage points [pp]) from standard to cognitively biased consultations. Bias-influenced error proportion (BIEP) was defined as the proportion of incorrect responses under cognitively biased consultations that aligned with the patient&#x2019;s preferred but incorrect option. For the dual-system framework, recovery of bias-induced loss (%) was calculated relative to the corresponding foundation-only baseline.</p></sec><sec id="s2-9"><title>Ethical Considerations</title><p>This simulation-based study used publicly available, deidentified data and involved no human participants. Therefore, ethics approval, informed consent, and participant compensation were not required. All materials are nonidentifiable.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Diagnostic Accuracy of LLMs Decreases Under Cognitively Biased Patient Consultations</title><p>Across all 6 evaluated foundation models, diagnostic accuracy declined markedly during biased consultations compared with both static single-response tests and standard multiturn consultations (<xref ref-type="fig" rid="figure3">Figure 3</xref>). 
Static and standard accuracies were largely comparable within each model (difference&#x2264;6 pp), suggesting that interactive dialogue itself did not substantially affect diagnostic performance in the absence of patient bias.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Diagnostic accuracy of large language models under static, standard, and cognitively biased evaluation conditions at the default decoding temperature.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e85770_fig03.png"/></fig><p>Each panel represents one model evaluated across 3 scenarios&#x2014;static evaluation, standard interaction, and cognitively biased interaction. Colored points show mean diagnostic accuracy (%) across 3 repeated runs (1273 MedQA-USMLE cases per run). Gray connecting lines link results from the same model across scenarios, and the dashed horizontal line marks the 25% random-guess baseline. All values shown are from the default decoding temperature setting (T=1.0); full temperature analyses are provided in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p><p>Lower-capacity models showed the steepest deterioration under biased consultations. ChatGPT-3.5-Turbo declined from 64.0% (SD 0.6%) accuracy in standard consultations to 25.0% (SD 0.9%) under bias (&#x2212;39.0 pp), and Gemini 1.5 Flash-8B declined from 61.3% (SD 0.7%) to 35.5% (SD 0.6%; &#x2212;25.8 pp), with both models approaching the 25% random-guess baseline. Mid-capacity models exhibited moderate declines. Gemini 1.5 Flash decreased from 68.0% (SD 0.7%) to 48.5% (SD 0.7%; &#x2212;19.5 pp), and GPT-4o-Mini declined from 69.5% (SD 0.7%) to 47.5% (SD 0.8%; &#x2212;22.0 pp). High-capacity models were comparatively resilient. 
Gemini 1.5 Pro showed a smaller reduction, from 72.8% (SD 0.9%) to 65.8% (SD 0.8%; &#x2212;7.0 pp), while GPT-4o declined from 82.3% (SD 0.6%) to 73.8% (SD 0.5%; &#x2212;8.5 pp), maintaining performance well above the random-chance threshold. Specifically, in the more affected models, this corresponds to approximately 258 to 390 additional incorrect outcomes per 1000 consultations in this evaluation setting.</p><p>Collectively, these findings identify patient cognitive bias as a systemic vulnerability in LLM-mediated diagnostic reasoning: smaller architectures experience substantial performance collapse under biased inputs, whereas high-capacity systems retain partial stability.</p></sec><sec id="s3-2"><title>Temperature Reduction Fails to Mitigate Bias-Induced Accuracy Decline</title><p>Reducing decoding temperature increases output determinism and is often assumed to improve reliability in structured reasoning tasks. To test whether this strategy enhances robustness to cognitively biased patient inputs, we evaluated all 6 models at 4 decoding settings (T=1.0, 0.7, 0.3, and 0.0).</p><p>In standard consultations, temperature reduction produced modest, model-dependent gains in high-capacity models (eg, GPT-4o: +3.2 pp; Gemini 1.5 Pro: +4.7 pp), suggesting limited stabilization of diagnostic reasoning under unbiased conditions.</p><p>In biased consultations, however, lowering the temperature did not mitigate performance decline and further degraded accuracy in lower-capacity models (eg, Gemini 1.5 Flash-8B: &#x2212;13.3 pp; ChatGPT-3.5-Turbo: &#x2212;2.8 pp), approaching random-guess levels. 
High-capacity models showed relatively stable performance but no consistent benefit from temperature reduction (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p><p>Overall, temperature tuning failed to alleviate bias-induced performance loss, indicating that increased determinism offers limited protection against patient cognitive bias and may exacerbate instability in lower-capacity models.</p></sec><sec id="s3-3"><title>LLMs Tend to Conform to Patient Cognitive Biases</title><p>We analyzed diagnostic error patterns during biased consultations using 2 complementary metrics. BIEP quantifies the fraction of errors aligning with a patient&#x2019;s incorrect self-diagnosis, while BIAD measures the absolute reduction in diagnostic accuracy (pp) between biased and standard consultations.</p><p>At the default decoding temperature (T=1.0), all 6 models exhibited substantial bias alignment, with BIEP values exceeding the random baseline of one-third (33.3%). The strongest conformity occurred in ChatGPT-3.5-Turbo (BIEP=81.0%), whereas Gemini 1.5 Pro showed the lowest alignment (BIEP=46.7%), indicating partial but incomplete resistance to patient bias. High-capacity models such as GPT-4o and Gemini 1.5 Pro displayed smaller BIAD values (7&#x2010;9 pp) compared with lower-capacity models such as Gemini 1.5 Flash-8B (25.8 pp) or ChatGPT-3.5-Turbo (39.0 pp).</p><p>As shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>, BIEP and BIAD were strongly positively correlated (<italic>r</italic>=0.97): models with larger bias-induced performance loss also produced a higher proportion of bias-aligned errors. 
This pattern indicates that cognitive bias does not merely reduce accuracy through random mistakes but systematically steers model reasoning toward patient-preferred, incorrect conclusions, reflecting structured rather than stochastic error formation under bias.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Association between bias-induced accuracy decline and bias-influenced error proportion in 6 large language models.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e85770_fig04.png"/></fig><p>Scatter plot illustrating the relationship between BIAD and BIEP during cognitively biased patient consultations. Each point represents one model evaluated at the default decoding temperature (T=1.0). The <italic>y</italic>-axis is scaled from 33% upward to emphasize that all models exceeded the empirical threshold of random error alignment. Models located in the upper-right quadrant exhibit both larger accuracy declines and stronger conformity with patient-preferred but incorrect diagnoses.</p></sec><sec id="s3-4"><title>Efficacy of Prompt-Based Mitigation Strategies</title><p>We evaluated 3 prompt-based strategies&#x2014;bias-aware, all-inclusive, and step-by-step mitigation&#x2014;for their ability to reduce diagnostic performance loss during cognitively biased patient consultations (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Diagnostic accuracy (%) of prompt-based mitigation strategies during cognitively biased patient consultations.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Baseline, mean (SD)</td><td align="left" valign="bottom">Bias-aware, mean (SD); pp<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom">All-inclusive, mean (SD); pp</td><td 
align="left" valign="bottom">Step-by-step, mean (SD); pp</td></tr></thead><tbody><tr><td align="left" valign="top">Gemini 1.5 Pro</td><td align="left" valign="top">65.8 (0.8)</td><td align="left" valign="top">63.3 (0.4); &#x2212;2.5</td><td align="left" valign="top">69.5 (0.6); +3.7</td><td align="left" valign="top">67.8 (0.5); +2.0</td></tr><tr><td align="left" valign="top">Gemini 1.5 Flash</td><td align="left" valign="top">48.5 (0.7)</td><td align="left" valign="top">49.0 (0.4); +0.5</td><td align="left" valign="top">54.0 (0.5); +5.5</td><td align="left" valign="top">51.0 (0.6); +2.5</td></tr><tr><td align="left" valign="top">Gemini 1.5 Flash-8B</td><td align="left" valign="top">35.5 (0.6)</td><td align="left" valign="top">37.3 (0.3); +1.8</td><td align="left" valign="top">39.8 (0.8); +4.3</td><td align="left" valign="top">33.0 (0.3); &#x2212;2.5</td></tr><tr><td align="left" valign="top">GPT-4o</td><td align="left" valign="top">73.8 (0.5)</td><td align="left" valign="top">76.0 (0.5); +2.2</td><td align="left" valign="top">78.6 (0.6); +4.8</td><td align="left" valign="top">70.9 (0.7); &#x2212;2.9</td></tr><tr><td align="left" valign="top">GPT-4o-Mini</td><td align="left" valign="top">47.5 (0.8)</td><td align="left" valign="top">45.0 (0.6); &#x2212;2.5</td><td align="left" valign="top">50.0 (0.4); +2.5</td><td align="left" valign="top">40.5 (0.7); &#x2212;7.0</td></tr><tr><td align="left" valign="top">ChatGPT-3.5-Turbo</td><td align="left" valign="top">25.0 (0.9)</td><td align="left" valign="top">32.8 (0.9); +7.8</td><td align="left" valign="top">29.6 (0.4); +4.6</td><td align="left" valign="top">19.1 (0.6); &#x2212;5.9</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>pp: percentage point.</p></fn></table-wrap-foot></table-wrap><p>The step-by-step approach, prompting models to separate objective from subjective information, showed mixed results: small gains in some high-capacity models (+2.0 pp in Gemini 1.5 Pro; +2.5 pp in Gemini 1.5 
Flash) but decreases in lower-capacity models (&#x2212;2.5 pp in Gemini 1.5 Flash-8B; &#x2212;5.9 pp in ChatGPT-3.5-Turbo).</p><p>The bias-aware strategy, which instructed models to identify and adjust for patient bias, yielded modest but inconsistent improvements. GPT-4o showed a small increase (+2.2 pp), while ChatGPT-3.5-Turbo improved by +7.8 pp; other models showed minimal change.</p><p>The all-inclusive prompt, a zero-shot instruction emphasizing comprehensive differential diagnosis, produced the most consistent benefit. High-capacity models such as GPT-4o (+4.8 pp) and Gemini 1.5 Pro (+3.7 pp) recovered roughly half of their bias-induced accuracy loss, whereas smaller and mid-capacity models showed modest gains of approximately 2.5 to 5.5 pp.</p><p>Overall, prompt-based strategies provided limited mitigation of patient cognitive bias. Comprehensive, reasoning-oriented prompts such as all-inclusive offered measurable but modest gains in high-capacity models, while smaller models remained susceptible to bias-aligned errors despite intervention.</p><p>Values represent the mean diagnostic accuracy (SD, in %) across 3 repeated runs for multiturn consultations under cognitively biased conditions (n=1273 MedQA-USMLE cases, decoding temperature=1.0). Values after the semicolon indicate the absolute change in accuracy (pp) for each mitigation strategy relative to the same model&#x2019;s biased baseline.</p></sec><sec id="s3-5"><title>Dual-System Framework Strengthens LLM Resilience Against Patient Cognitive Bias</title><p>We evaluated a dual-system framework that integrates a foundation LLM for multiturn patient interaction (System 1) with a reasoning-oriented LLM (o1-mini, System 2) for final diagnostic judgment, assessing its ability to mitigate bias-induced performance degradation (<xref ref-type="table" rid="table2">Table 2</xref>). 
The dual-system framework consistently improved diagnostic accuracy across most models compared with foundation-only baselines under both standard and cognitively biased consultations; the exception was GPT-4o, for which biased-condition accuracy decreased slightly from 73.8% to 72.8%. For reference, when evaluated as a standalone model under the same multiturn protocol, o1-mini exhibited only a minimal performance decrease, from 87.2% (SD 0.8%) under standard interaction to 85.1% (SD 0.5%) under cognitively biased interaction.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Diagnostic accuracy (%) of the dual-system framework vs foundation-only baselines under standard and cognitively biased consultations.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Standard (foundation), mean (SD)</td><td align="left" valign="bottom">Standard (dual), mean (SD); pp<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="bottom">Cognitively biased (foundation), mean (SD)</td><td align="left" valign="bottom">Cognitively biased (dual), mean (SD); pp</td><td align="left" valign="bottom">Recovery of bias-induced loss (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Gemini 1.5 Pro</td><td align="left" valign="top">72.8 (0.9)</td><td align="left" valign="top">81.0 (1.0); +8.2</td><td align="left" valign="top">65.8 (0.8)</td><td align="left" valign="top">75.5 (1.1); +9.7</td><td align="left" valign="top">138.6</td></tr><tr><td align="left" valign="top">Gemini 1.5 Flash</td><td align="left" valign="top">68.0 (0.7)</td><td align="left" valign="top">78.5 (0.4); +10.5</td><td align="left" valign="top">48.5 (0.7)</td><td align="left" valign="top">67.3 (0.8); +18.8</td><td align="left" valign="top">96.4</td></tr><tr><td align="left" valign="top">Gemini 1.5 Flash-8B</td><td align="left" valign="top">61.3 (0.7)</td><td 
align="left" valign="top">78.3 (0.3); +17.0</td><td align="left" valign="top">35.5 (0.6)</td><td align="left" valign="top">65.8 (0.4); +30.3</td><td align="left" valign="top">117.4</td></tr><tr><td align="left" valign="top">GPT-4o</td><td align="left" valign="top">82.3 (0.6)</td><td align="left" valign="top">85.2 (0.6); +2.9</td><td align="left" valign="top">73.8 (0.5)</td><td align="left" valign="top">72.8 (0.8); &#x2212;1.0</td><td align="left" valign="top">&#x2212;11.8</td></tr><tr><td align="left" valign="top">GPT-4o-Mini</td><td align="left" valign="top">69.5 (0.7)</td><td align="left" valign="top">77.8 (0.7); +8.3</td><td align="left" valign="top">47.5 (0.8)</td><td align="left" valign="top">67.2 (0.6); +19.7</td><td align="left" valign="top">89.5</td></tr><tr><td align="left" valign="top">ChatGPT-3.5-Turbo</td><td align="left" valign="top">64.0 (0.6)</td><td align="left" valign="top">78.5 (0.4); +14.5</td><td align="left" valign="top">25.0 (0.9)</td><td align="left" valign="top">63.7 (0.3); +38.7</td><td align="left" valign="top">99.2</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>pp: percentage point.</p></fn></table-wrap-foot></table-wrap><p>In standard consultations, accuracy increased to 77.8%&#x2010;85.2% across models, compared with 61.3%&#x2010;82.3% for foundation-only performance. The largest gains were observed in lower-capacity models, such as ChatGPT-3.5-Turbo (+14.5 pp) and Gemini 1.5 Flash-8B (+17.0 pp), where improvements mainly reflected the enhanced diagnostic reasoning provided by the System 2 model (o1-mini).</p><p>In biased consultations, the dual-system framework achieved 63.7%&#x2010;75.5% accuracy, compared with 25.0%&#x2010;73.8% for foundation-only models. 
Performance recovery was most pronounced in bias-prone models such as Gemini 1.5 Flash-8B (+30.3 pp) and ChatGPT-3.5-Turbo (+38.7 pp); across the 5 models that improved, the framework restored approximately 90%&#x2010;140% of the bias-induced loss.</p><p>Compared with the best-performing prompt-based mitigation (all-inclusive), the dual-system framework delivered greater and more consistent improvements across models. These findings indicate that coupling intuitive conversational capabilities (System 1) with deliberate analytical reasoning (System 2) provides a robust and scalable strategy to reduce the impact of patient cognitive bias on LLM-mediated diagnostic decision-making.</p><p>Evaluations were conducted in multiturn simulated consultations (n=1273 MedQA-USMLE cases, temperature=1.0) using a GPT-4o-Mini&#x2013;powered patient agent. Values represent mean diagnostic accuracy (SD, in %) averaged across 3 repeated runs. Values after the semicolon indicate the absolute change (pp) from the corresponding foundation-only baseline. Recovery of bias-induced loss (%) represents the proportion of accuracy lost under biased interaction (standard foundation &#x2212; biased foundation) that is restored by the dual-system framework; values above 100% indicate that dual-system accuracy under biased interaction exceeded the standard foundation-only accuracy, and negative values indicate no recovery (dual-system accuracy fell below the biased foundation-only baseline).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Behavioral Risk and Key Findings</title><p>Across models, cognitively biased user framing systematically redirected diagnostic reasoning toward patient-preferred but incorrect conclusions, resulting in substantial performance degradation. This effect was consistent across interactions, indicating a structured influence of user bias rather than random error.</p><p>As LLMs become increasingly used by the public for health information and preliminary medical advice, user behavior increasingly shapes the reliability and safety of AI-mediated care. 
In this emerging context, behavioral variability&#x2014;particularly cognitive bias in how users seek, interpret, and communicate medical information&#x2014;has become a new and underrecognized source of systemic risk. While traditional discussions of model bias have focused on technical factors such as architecture or training data [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>], our findings highlight the human side of the interaction as an equally critical determinant of reliability. By framing cognitive bias as an interaction-level risk rather than solely an individual limitation, this study underscores the need to account for human behavioral factors as integral components of AI safety in health care.</p></sec><sec id="s4-2"><title>Model Vulnerability Under Biased Interaction</title><p>Our evaluation shows that even advanced foundation LLMs, despite demonstrating strong baseline medical reasoning relative to smaller or open-source counterparts [<xref ref-type="bibr" rid="ref27">27</xref>], remain vulnerable to cognitively biased patient interactions. Under biased conditions, lower-capacity models exhibited severe performance deterioration, while elevated proportions of bias-aligned errors revealed a systematic tendency to converge on user misconceptions, reinforcing false beliefs and amplifying health-related misunderstandings.</p><p>This vulnerability is particularly concerning in light of the high level of public trust placed in AI-generated health information. Users often perceive responses from conversational AI systems as equally or more credible than advice from human clinicians [<xref ref-type="bibr" rid="ref28">28</xref>]. 
When such trust is coupled with bias-congruent reasoning, inaccurate recommendations may be accepted without verification, increasing the risk of delayed medical consultation, the persistence of erroneous beliefs, and unsafe self-management behaviors.</p><p>Importantly, the marked deterioration observed during biased&#x2014;but not unbiased&#x2014;consultations highlights a critical blind spot in prevailing evaluation practices. Existing benchmarks rely primarily on static, single-turn tasks and therefore overlook the behavioral and dialogic complexity of real-world consultations, failing to capture how patient framing can systematically distort reasoning and undermine diagnostic reliability [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>].</p><p>At the level of clinical reasoning, biased patient framing alters the trajectory and weighting of information considered during dialogue, leading to failures of effective knowledge utilization rather than deficits in underlying medical knowledge. Once an incorrect preference is explicitly introduced, agreement-oriented tendencies may further stabilize these redirected reasoning paths, contributing to the high proportion of bias-aligned errors observed. 
These findings are therefore better understood not as simple hallucination or general conversational drift but as distortion of diagnostic reasoning under patient-driven framing; more specifically, the present paradigm is most consistent with a self-diagnosis&#x2013;anchored framing effect, in which the patient&#x2019;s initial preferred explanation creates an anchoring point, and subsequent selective emphasis exerts confirmation-like pressure on the dialogue, potentially further amplified by sycophancy-like tendencies in aligned conversational models [<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref33">33</xref>].</p></sec><sec id="s4-3"><title>Prompting and Parameter Effects</title><p>Our findings indicate that, among the evaluated prompt-engineering approaches, only the all-inclusive strategy demonstrated consistent mitigation against patient cognitive bias across interactive clinical scenarios. By encouraging consideration of multiple diagnostic possibilities prior to judgment, the all-inclusive strategy may help reduce early anchoring on a patient-preferred explanation driven by repeated symptom emphasis. By comparison, the bias-aware strategy showed limited effectiveness, likely because biased framing is often intertwined with clinically plausible symptom descriptions, making it difficult, at the time of interaction, to clearly distinguish cognitive bias from reasonable diagnostic inference. As a zero-shot approach, the all-inclusive strategy also offers strong adaptability for real-world deployment, where models must respond effectively to diverse and previously unseen patient inputs [<xref ref-type="bibr" rid="ref34">34</xref>].</p><p>Previous work has shown that lowering the decoding temperature in language models can improve accuracy by reducing output randomness [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. 
Our findings refine this understanding by revealing important context-dependent trade-offs. In standard patient interactions, lower temperature settings yielded small accuracy gains in some models, likely by favoring high-probability clinical hypotheses. However, in cognitively biased interactions&#x2014;particularly for lower-capacity models&#x2014;temperature reduction increased the likelihood of bias-aligned responses, thereby amplifying diagnostic errors. One plausible explanation is that deterministic decoding encourages early commitment to a locally high-probability diagnostic hypothesis shaped by the patient&#x2019;s biased framing, reducing the model&#x2019;s ability to reconsider alternative explanations as the dialogue progresses. More broadly, temperature controls how strongly the model follows high-probability continuations conditioned on the conversation history; when the probability distribution has already been shifted by biased patient framing, lower temperatures may reinforce this anchored reasoning path. Alternative decoding strategies, such as nucleus sampling or constrained decoding, may influence bias propagation and represent directions for future research. This suggests that while deterministic decoding can stabilize model outputs, it may also entrench erroneous reasoning when the input context is biased. Optimizing temperature settings for LLM deployment in health care will therefore require context-specific calibration, balancing determinism with the flexibility needed to resist bias-driven misinformation.</p></sec><sec id="s4-4"><title>Dual-System Framework</title><p>Dual-system theory conceptualizes human cognition as operating through 2 complementary modes: a fast, intuitive System 1 and a slower, more deliberative System 2 [<xref ref-type="bibr" rid="ref18">18</xref>]. In clinical practice, these modes are reflected in the distinction between conversational history-taking and analytic diagnostic reasoning. 
We adopt this framework to guide the architectural design of patient-facing LLM systems, pairing a foundation model intended for rapid, fluent interaction with a reasoning-oriented LLM for diagnosis and treatment planning, consistent with prior evidence that reasoning-oriented LLMs achieve stronger performance on tasks requiring structured, multistep reasoning [<xref ref-type="bibr" rid="ref37">37</xref>].</p><p>Consistent with this conceptual alignment, our evaluation shows that such a dual-system configuration improves diagnostic performance in standard consultations and, critically, mitigates performance degradation under cognitively biased interactions. By confining deliberative reasoning to the final judgment stage while retaining the foundation LLM for upstream patient communication, the framework enhances robustness to biased input while reducing the computational and interactional burdens associated with applying deliberative reasoning throughout the entire dialogue. This benefit is most pronounced in lower-capacity systems, where additional reasoning support reduces bias-aligned errors and stabilizes performance in bias-prone scenarios. The GPT-4o exception suggests that such gains may be smaller when the foundation model already has strong internal reasoning and bias resistance, as replacing its final judgment with o1-mini provided no additional benefit under biased consultations.</p><p>In practice, while o1-mini shows stronger resistance to biased inputs, it is not optimized for fluent patient interaction and incurs higher reasoning costs, underscoring the need for a collaborative rather than single-model solution. Foundation models, by contrast, may be better suited for patient-facing tasks such as clinical history collection because of their practical advantages in speed and conversational interaction. 
Accordingly, the dual-system framework can be understood as a design strategy that confines deliberative reasoning to the final judgment stage, thereby balancing diagnostic robustness with practical efficiency. Importantly, this framework should be interpreted as an architectural strategy rather than a model-specific solution, and other reasoning-oriented models could potentially serve a similar System 2 role by independently evaluating consultation transcripts. From a deployment perspective, this architecture resembles a clinical workflow in which conversational AI systems collect patient information while a separate reasoning or decision-support module performs diagnostic evaluation, potentially providing a safer framework for real-world health care applications.</p><p>However, the performance of the dual-system framework remains bounded by the interaction history generated by the foundation model. Under cognitively biased conditions, elements of biased framing may persist in the dialogue transcript and constrain downstream reasoning, which may partly explain why dual-system performance does not fully match that of a standalone reasoning model. This effect appears more pronounced when lower-capacity foundation models are used, likely because they reinforce biased narratives more strongly during history taking.</p></sec><sec id="s4-5"><title>Input Guidance and Feedback</title><p>LLMs are highly sensitive to user input framing, a property that increases their vulnerability to biased or incomplete patient narratives in medical settings. Guiding patients toward more effective input strategies&#x2014;such as structured symptom checklists or guided question prompts&#x2014;may improve the reliability of LLM-generated recommendations and reduce variability in patient-LLM interactions. 
More broadly, standardized interaction protocols that minimize biased framing could support more consistent and trustworthy communication in patient-facing applications.</p><p>This input-output dynamic also has implications for model refinement. Reinforcement Learning from Human Feedback is widely used to align LLM responses with human preferences and desired outcomes [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. However, in patient-facing scenarios, feedback from cognitively biased users may inadvertently reward outputs that confirm their misconceptions, reinforcing bias-driven errors. At the current stage of AI deployment in health care, professional supervision and expert feedback&#x2014;rather than unfiltered patient feedback&#x2014;should guide model optimization, particularly for high-stakes clinical tasks [<xref ref-type="bibr" rid="ref40">40</xref>].</p></sec><sec id="s4-6"><title>Limitations and Future Work</title><p>This study focuses on patient-facing interactions and therefore evaluates widely used commercial LLMs that currently dominate real-world patient access. Nevertheless, extending the proposed framework to open-source and locally deployable models remains an important direction for future work.</p><p>The simulated patient used verbatim medical terminology (eg, &#x201C;a bulging disc impinging on a lumbar spinal nerve&#x201D;), which may not fully reflect the colloquial nature of real-world patient interactions. This design choice was made for diagnostic clarity, but it may limit the ecological validity of the interactions.</p><p>Patient cognitive bias in this study was operationalized as an explicit self-diagnosis framing condition, in which the simulated patient adopted a preferred but incorrect explanation from the outset and then displayed anchoring and confirmation-like behavior throughout the consultation. 
This should be distinguished from more common real-world patient narratives, in which bias may be subtler and unintentional, for example through selective symptom emphasis, omission, or framing without a firm self-diagnosis. Accordingly, our design represents a relatively high-intensity and stable bias condition intended to provide a standardized stress test of model robustness and may therefore overestimate the magnitude of bias-induced misjudgment relative to many routine patient-AI interactions. In real consultations, patient cognitive bias likely exists along a spectrum, ranging from mild framing effects to persistent self-diagnosis anchoring, and may be intermittent, inconsistent, or evolve during the interaction rather than remain stable throughout. From a human-factors perspective, the present design was intended to model a clinically recognizable higher-intensity bias pattern in which patients become anchored to a specific explanation and selectively foreground supporting information. Therefore, the findings should be interpreted as most directly applicable to stronger self-diagnosis&#x2013;driven bias scenarios rather than to all forms of patient cognitive bias.</p><p>The inherent limitations of simulation in this study meant that certain physician-like strategies for addressing patient cognitive bias could not be fully evaluated. In real consultations, physicians may use open-ended questions to encourage patient self-reflection or elicit additional critical symptoms, signs, or test results [<xref ref-type="bibr" rid="ref41">41</xref>]. 
In our simulations, however, patient responses were restricted to predefined clinical data and their associated biases to avoid introducing uncontrolled experimental variables, limiting the scope for such interactive techniques.</p><p>The few-shot mitigation condition was excluded from quantitative analysis because its prompt design was not aligned with the evaluation requirement for forced final-option selection.</p><p>When evaluating the dual-system framework, foundation LLMs were prompted using only basic physician role instructions. Incorporating alternative prompting strategies&#x2014;such as the all-inclusive approach, which encourages consideration of all relevant medical aspects before forming a diagnosis&#x2014;may further enhance the framework&#x2019;s performance and robustness against patient cognitive bias. Moreover, as this study was conducted in a standardized simulated consultation environment, the interactions may not fully represent natural clinical conversations, which may limit ecological validity. In addition, only a single generative patient agent was used, and variability across different patient agents was not examined, which may limit generalizability.</p><p>Because MedQA-USMLE is publicly available, commercial LLMs may have been exposed to some benchmark items during pretraining or posttraining, which could inflate absolute accuracy estimates and affect the observed magnitude of performance decline. Accordingly, our results should be interpreted as within-model relative comparisons under biased interaction rather than uncontaminated estimates of absolute diagnostic competence.</p></sec><sec id="s4-7"><title>Conclusions</title><p>This study identifies patient-driven cognitive bias as a behavioral risk that compromises the reliability of LLMs in health consultations. Across 6 contemporary LLMs, biased user input led to substantial degradation in diagnostic accuracy, particularly in lower-capacity systems. 
Common mitigation approaches such as prompt engineering or temperature adjustment offered limited protection. In contrast, a dual-system framework&#x2014;combining conversational and reasoning LLMs&#x2014;restored most of the performance lost under bias and provided a scalable design for safer, bias-aware medical AI. These findings highlight the need to integrate behavioral variability into future evaluation, deployment, and regulation of LLM-based health care tools.</p></sec></sec></body><back><ack><p>The authors thank all colleagues who provided general feedback during the development of this study. Generative artificial intelligence tools were used solely for language editing and stylistic refinement of the manuscript. The authors take full responsibility for the accuracy and integrity of the content. We release the code for running our dialog simulation system and the code for testing and evaluation as a public GitHub repository [<xref ref-type="bibr" rid="ref42">42</xref>].</p></ack><notes><sec><title>Funding</title><p>This work was supported by the Young Researcher Fund Project of the Ministry of Education Foundation on Humanities and Social Sciences under grant 24YJC880135, the Hunan Provincial Natural Science Foundation of China under grant 2026JJ50060, and the Excellent Youth Scientific Research Project of the Hunan Provincial Department of Education under grant 23B0286. The funding sources played no role in the design, implementation, data analysis, interpretation, or reporting of this study.</p></sec><sec><title>Data Availability</title><p>The dataset used for testing, the roles, and the mitigation strategies&#x2019; prompt dataset and results can be found in our project GitHub repository [<xref ref-type="bibr" rid="ref42">42</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>YZ collected the data and developed the simulation program. QW conducted model testing and performed the statistical analysis. 
SW designed the study, performed statistical analysis, revised the manuscript, and provided overall supervision, organizational support, and conceptual guidance. All authors reviewed and approved the final manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">BIAD</term><def><p>bias-induced accuracy decline</p></def></def-item><def-item><term id="abb3">BIEP</term><def><p>bias-influenced error proportion</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">MedQA</term><def><p>medical question answering dataset</p></def></def-item><def-item><term id="abb6">pp</term><def><p>percentage points</p></def></def-item><def-item><term id="abb7">USMLE</term><def><p>United States Medical Licensing Examination</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>KFF health misinformation tracking poll: artificial intelligence and health information</article-title><source>Kaiser Family Foundation</source><year>2024</year><access-date>2026-05-28</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.kff.org/public-opinion/kff-health-misinformation-tracking-poll-artificial-intelligence-and-health-information/">https://www.kff.org/public-opinion/kff-health-misinformation-tracking-poll-artificial-intelligence-and-health-information/</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayre</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cvejic</surname><given-names>E</given-names> </name><name 
name-style="western"><surname>McCaffery</surname><given-names>KJ</given-names> </name></person-group><article-title>Use of ChatGPT to obtain health information in Australia, 2024: insights from a nationally representative survey</article-title><source>Med J Aust</source><year>2025</year><month>03</month><day>3</day><volume>222</volume><issue>4</issue><fpage>210</fpage><lpage>212</lpage><pub-id pub-id-type="doi">10.5694/mja2.52598</pub-id><pub-id pub-id-type="medline">39901778</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayo-Ajibola</surname><given-names>O</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Riddell</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kravitz</surname><given-names>RL</given-names> </name></person-group><article-title>Characterizing the adoption and experiences of users of artificial intelligence-generated health information in the United States: cross-sectional questionnaire study</article-title><source>J Med Internet Res</source><year>2024</year><month>08</month><day>14</day><volume>26</volume><fpage>e55138</fpage><pub-id pub-id-type="doi">10.2196/55138</pub-id><pub-id pub-id-type="medline">39141910</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mansour</surname><given-names>RF</given-names> </name><name name-style="western"><surname>Fatouh</surname><given-names>AH</given-names> </name></person-group><article-title>Measurement of bias in the contents of web search for health information retrieval</article-title><source>J Scientometric 
Res</source><year>2023</year><month>11</month><day>30</day><volume>12</volume><issue>3</issue><fpage>621</fpage><lpage>630</lpage><pub-id pub-id-type="doi">10.5530/jscires.12.3.060</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Suzuki</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yamamoto</surname><given-names>Y</given-names> </name></person-group><article-title>Characterizing the influence of confirmation bias on web search behavior</article-title><source>Front Psychol</source><year>2021</year><volume>12</volume><fpage>771948</fpage><pub-id pub-id-type="doi">10.3389/fpsyg.2021.771948</pub-id><pub-id pub-id-type="medline">34938242</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pines</surname><given-names>JM</given-names> </name></person-group><article-title>Profiles in patient safety: confirmation bias in emergency medicine</article-title><source>Acad Emergency Med</source><year>2006</year><month>01</month><volume>13</volume><issue>1</issue><fpage>90</fpage><lpage>94</lpage><pub-id pub-id-type="doi">10.1111/j.1553-2712.2006.tb00990.x</pub-id><pub-id pub-id-type="medline">16365325</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kusa</surname><given-names>W</given-names> </name><name name-style="western"><surname>Mosca</surname><given-names>E</given-names> </name><name name-style="western"><surname>Lipani</surname><given-names>A</given-names> </name></person-group><article-title>&#x201C;Dr LLM, what do I have?&#x201D;: the impact of user beliefs and prompt formulation on health diagnoses</article-title><conf-name>Proceedings of the Third Workshop on NLP 
for Medical Conversations</conf-name><conf-date>Nov 1-4, 2023</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2023.nlpmc-1">https://aclanthology.org/2023.nlpmc-1</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2023.nlpmc-1.2</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Loya</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sinha</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Futrell</surname><given-names>R</given-names> </name></person-group><article-title>Exploring the sensitivity of LLMs&#x2019; decision-making capabilities: insights from prompt variations and hyperparameters</article-title><conf-name>Findings of the Association for Computational Linguistics</conf-name><conf-date>Dec 6-10, 2023</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2023.findings-emnlp">https://aclanthology.org/2023.findings-emnlp</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2023.findings-emnlp.241</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Orr-Ewing</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Testing and evaluation of health care applications of large language models: a systematic review</article-title><source>JAMA</source><year>2025</year><month>01</month><day>28</day><volume>333</volume><issue>4</issue><fpage>319</fpage><lpage>328</lpage><pub-id pub-id-type="doi">10.1001/jama.2024.21700</pub-id><pub-id 
pub-id-type="medline">39405325</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Benary</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>XD</given-names> </name><name name-style="western"><surname>Schmidt</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Leveraging large language models for decision support in personalized oncology</article-title><source>JAMA Netw Open</source><year>2023</year><month>11</month><day>1</day><volume>6</volume><issue>11</issue><fpage>e2343689</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.43689</pub-id><pub-id pub-id-type="medline">37976064</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zaretsky</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Baskharoun</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Generative artificial intelligence to transform inpatient discharge summaries to patient-friendly language and format</article-title><source>JAMA Netw Open</source><year>2024</year><month>03</month><day>4</day><volume>7</volume><issue>3</issue><fpage>e240357</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.0357</pub-id><pub-id pub-id-type="medline">38466307</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Strika</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Petkovic</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Likic</surname><given-names>R</given-names> </name><name name-style="western"><surname>Batenburg</surname><given-names>R</given-names> </name></person-group><article-title>Bridging healthcare gaps: a scoping review on the role of artificial intelligence, deep learning, and large language models in alleviating problems in medical deserts</article-title><source>Postgrad Med J</source><year>2024</year><month>12</month><day>23</day><volume>101</volume><issue>1191</issue><fpage>4</fpage><lpage>16</lpage><pub-id pub-id-type="doi">10.1093/postmj/qgae122</pub-id><pub-id pub-id-type="medline">39323384</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rutledge</surname><given-names>GW</given-names> </name><name name-style="western"><surname>Sivura</surname><given-names>A</given-names> </name></person-group><article-title>A generative AI&#x2013;based virtual physician assistant</article-title><source>Proc AAAI Symp Ser</source><year>2024</year><volume>3</volume><issue>1</issue><fpage>64</fpage><lpage>65</lpage><pub-id pub-id-type="doi">10.1609/aaaiss.v3i1.31182</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wen</surname><given-names>B</given-names> </name><name name-style="western"><surname>Norel</surname><given-names>R</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Stappenbeck</surname><given-names>T</given-names> </name><name name-style="western"><surname>Zulkernine</surname><given-names>F</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>H</given-names> </name></person-group><article-title>Leveraging large language models for patient engagement: the power of 
conversational AI in digital health</article-title><conf-name>2024 IEEE International Conference on Digital Health (ICDH)</conf-name><conf-date>Jul 7-13, 2024</conf-date><pub-id pub-id-type="doi">10.1109/ICDH62654.2024.00027</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kopka</surname><given-names>M</given-names> </name><name name-style="western"><surname>von Kalckreuth</surname><given-names>N</given-names> </name><name name-style="western"><surname>Feufel</surname><given-names>MA</given-names> </name></person-group><article-title>Accuracy of online symptom assessment applications, large language models, and laypeople for self-triage decisions</article-title><source>NPJ Digit Med</source><year>2025</year><month>03</month><day>25</day><volume>8</volume><issue>1</issue><fpage>178</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01566-6</pub-id><pub-id pub-id-type="medline">40133390</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reddy</surname><given-names>S</given-names> </name></person-group><article-title>Global harmonization of artificial intelligence-enabled software as a medical device regulation: addressing challenges and unifying standards</article-title><source>Mayo Clin Proc Digit Health</source><year>2025</year><month>03</month><volume>3</volume><issue>1</issue><fpage>100191</fpage><pub-id pub-id-type="doi">10.1016/j.mcpdig.2024.100191</pub-id><pub-id pub-id-type="medline">40207007</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="report"><article-title>Ethics and governance of artificial intelligence for health: guidance on large multimodal models</article-title><year>2024</year><access-date>2026-05-28</access-date><publisher-name>World Health 
Organization</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://iris.who.int/server/api/core/bitstreams/e9e62c65-6045-481e-bd04-20e206bc5039/content">https://iris.who.int/server/api/core/bitstreams/e9e62c65-6045-481e-bd04-20e206bc5039/content</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kahneman</surname><given-names>D</given-names> </name></person-group><source>Thinking, Fast and Slow</source><year>2011</year><publisher-name>Penguin Books</publisher-name><pub-id pub-id-type="other">978-0141033570</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Qin</surname><given-names>X</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>J</given-names> </name></person-group><article-title>AITurk: using ChatGPT for social science research</article-title><source>SSRN</source><comment>Preprint posted online on Jun 7, 2024</comment><pub-id pub-id-type="doi">10.2139/ssrn.4922861</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Park</surname><given-names>JS</given-names> </name><name name-style="western"><surname>O&#x2019;Brien</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cai</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Morris</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bernstein</surname><given-names>MS</given-names> 
</name></person-group><article-title>Generative agents: interactive simulacra of human behavior</article-title><conf-name>ACM Symposium on User Interface Software and Technology (UIST 2023)</conf-name><conf-date>Oct 29 to Nov 1, 2023</conf-date><pub-id pub-id-type="doi">10.1145/3586183.3606763</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pressman</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Borna</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gomez-Cabello</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Haider</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Haider</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Forte</surname><given-names>AJ</given-names> </name></person-group><article-title>Clinical and surgical applications of large language models: a systematic review</article-title><source>J Clin Med</source><year>2024</year><month>05</month><day>22</day><volume>13</volume><issue>11</issue><fpage>3041</fpage><pub-id pub-id-type="doi">10.3390/jcm13113041</pub-id><pub-id pub-id-type="medline">38892752</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Carl&#x00E0;</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Gambini</surname><given-names>G</given-names> </name><name name-style="western"><surname>Baldascino</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Large language models as assistance for glaucoma surgical cases: a ChatGPT vs. 
Google Gemini comparison</article-title><source>Graefes Arch Clin Exp Ophthalmol</source><year>2024</year><month>09</month><volume>262</volume><issue>9</issue><fpage>2945</fpage><lpage>2959</lpage><pub-id pub-id-type="doi">10.1007/s00417-024-06470-5</pub-id><pub-id pub-id-type="medline">38573349</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Carl&#x00E0;</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Gambini</surname><given-names>G</given-names> </name><name name-style="western"><surname>Baldascino</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Exploring AI-chatbots&#x2019; capability to suggest surgical planning in ophthalmology: ChatGPT versus Google Gemini analysis of retinal detachment cases</article-title><source>Br J Ophthalmol</source><year>2024</year><month>09</month><day>20</day><volume>108</volume><issue>10</issue><fpage>1457</fpage><lpage>1469</lpage><pub-id pub-id-type="doi">10.1136/bjo-2023-325143</pub-id><pub-id pub-id-type="medline">38448201</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Du</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>A comparative study on reasoning patterns of OpenAI&#x2019;s o1 model</article-title><source>arXiv</source><comment>Preprint posted online on Oct 17, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.13639</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Du</surname><given-names>X</given-names> </name><name name-style="western"><surname>Song</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name></person-group><article-title>A survey on fairness in large language models</article-title><source>arXiv</source><comment>Preprint posted online on Aug 20, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2308.10149</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pressman</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Borna</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gomez-Cabello</surname><given-names>CA</given-names> </name><name name-style="western"><surname>Haider</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Haider</surname><given-names>C</given-names> </name><name name-style="western"><surname>Forte</surname><given-names>AJ</given-names> </name></person-group><article-title>AI and ethics: a systematic review of the ethical considerations of large language model use in surgery research</article-title><source>Healthcare (Basel)</source><year>2024</year><month>04</month><day>13</day><volume>12</volume><issue>8</issue><fpage>825</fpage><pub-id pub-id-type="doi">10.3390/healthcare12080825</pub-id><pub-id pub-id-type="medline">38667587</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schmidgall</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Harris</surname><given-names>C</given-names> </name><name name-style="western"><surname>Essien</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Evaluation and mitigation of cognitive biases in medical language models</article-title><source>NPJ Digit Med</source><year>2024</year><month>10</month><day>21</day><volume>7</volume><issue>1</issue><fpage>295</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01283-6</pub-id><pub-id pub-id-type="medline">39433945</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Leslie-Miller</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Simon</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Dean</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mokhallati</surname><given-names>N</given-names> </name><name name-style="western"><surname>Cushing</surname><given-names>CC</given-names> </name></person-group><article-title>The critical need for expert oversight of ChatGPT: prompt engineering for safeguarding child healthcare information</article-title><source>J Pediatr Psychol</source><year>2024</year><month>11</month><day>1</day><volume>49</volume><issue>11</issue><fpage>812</fpage><lpage>817</lpage><pub-id pub-id-type="doi">10.1093/jpepsy/jsae075</pub-id><pub-id pub-id-type="medline">39271174</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on 
the United States Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><issue>1</issue><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Herrmann-Werner</surname><given-names>A</given-names> </name><name name-style="western"><surname>Festl-Wietek</surname><given-names>T</given-names> </name><name name-style="western"><surname>Holderried</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Assessing ChatGPT&#x2019;s mastery of Bloom&#x2019;s taxonomy using psychosomatic medicine exam questions: mixed-methods study</article-title><source>J Med Internet Res</source><year>2024</year><month>01</month><day>23</day><volume>26</volume><fpage>e52113</fpage><pub-id pub-id-type="doi">10.2196/52113</pub-id><pub-id pub-id-type="medline">38261378</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Sharma</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tong</surname><given-names>M</given-names> </name><name name-style="western"><surname>Korbak</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Towards understanding sycophancy in language models</article-title><access-date>2026-05-28</access-date><conf-name>The Twelfth International Conference on Learning Representations</conf-name><conf-date>May 7-11, 2024</conf-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://openreview.net/forum?id=tvhaxkMKAn">https://openreview.net/forum?id=tvhaxkMKAn</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ly</surname><given-names>DP</given-names> </name><name name-style="western"><surname>Shekelle</surname><given-names>PG</given-names> </name><name name-style="western"><surname>Song</surname><given-names>Z</given-names> </name></person-group><article-title>Evidence for anchoring bias during physician decision-making</article-title><source>JAMA Intern Med</source><year>2023</year><month>08</month><day>1</day><volume>183</volume><issue>8</issue><fpage>818</fpage><lpage>823</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2023.2366</pub-id><pub-id pub-id-type="medline">37358843</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nendaz</surname><given-names>M</given-names> </name><name name-style="western"><surname>Perrier</surname><given-names>A</given-names> </name></person-group><article-title>Diagnostic errors and flaws in clinical reasoning: mechanisms and prevention in practice</article-title><source>Swiss Med Wkly</source><year>2012</year><volume>142</volume><fpage>w13706</fpage><pub-id pub-id-type="doi">10.4414/smw.2012.13706</pub-id><pub-id pub-id-type="medline">23135902</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xian</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Lampert</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Schiele</surname><given-names>B</given-names> </name><name name-style="western"><surname>Akata</surname><given-names>Z</given-names> 
</name></person-group><article-title>Zero-shot learning&#x2014;a comprehensive evaluation of the good, the bad and the ugly</article-title><source>IEEE Trans Pattern Anal Mach Intell</source><year>2019</year><month>09</month><volume>41</volume><issue>9</issue><fpage>2251</fpage><lpage>2265</lpage><pub-id pub-id-type="doi">10.1109/TPAMI.2018.2857768</pub-id><pub-id pub-id-type="medline">30028691</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gottweis</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Toward expert-level medical question answering with large language models</article-title><source>Nat Med</source><year>2025</year><month>03</month><volume>31</volume><issue>3</issue><fpage>943</fpage><lpage>950</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03423-7</pub-id><pub-id pub-id-type="medline">39779926</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Van Koevering</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kleinberg</surname><given-names>J</given-names> </name></person-group><article-title>How random is random? 
Evaluating the randomness and humanness of LLMs&#x2019; coin flips</article-title><source>arXiv</source><comment>Preprint posted online on May 31, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.00092</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>ZZ</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>ML</given-names> </name><etal/></person-group><article-title>From system 1 to system 2: a survey of reasoning large language models</article-title><source>arXiv</source><comment>Preprint posted online on Feb 24, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2502.17419</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giuffr&#x00E8;</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kresevic</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pugliese</surname><given-names>N</given-names> </name><name name-style="western"><surname>You</surname><given-names>K</given-names> </name><name name-style="western"><surname>Shung</surname><given-names>DL</given-names> </name></person-group><article-title>Optimizing large language models in digestive disease: strategies and challenges to improve clinical outcomes</article-title><source>Liver Int</source><year>2024</year><month>09</month><volume>44</volume><issue>9</issue><fpage>2114</fpage><lpage>2124</lpage><pub-id pub-id-type="doi">10.1111/liv.15974</pub-id><pub-id pub-id-type="medline">38819632</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Rahman</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Irbaz</surname><given-names>MS</given-names> </name><name name-style="western"><surname>North</surname><given-names>K</given-names> </name><name name-style="western"><surname>Williams</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Zampieri</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lybarger</surname><given-names>K</given-names> </name></person-group><article-title>Health text simplification: an annotated corpus for digestive cancer education and novel strategies for reinforcement learning</article-title><source>J Biomed Inform</source><year>2024</year><month>10</month><volume>158</volume><fpage>104727</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2024.104727</pub-id><pub-id pub-id-type="medline">39293643</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mirzaei</surname><given-names>T</given-names> </name><name name-style="western"><surname>Amini</surname><given-names>L</given-names> </name><name name-style="western"><surname>Esmaeilzadeh</surname><given-names>P</given-names> </name></person-group><article-title>Clinician voices on ethics of LLM integration in healthcare: a thematic analysis of ethical concerns and implications</article-title><source>BMC Med Inform Decis Mak</source><year>2024</year><month>09</month><day>9</day><volume>24</volume><issue>1</issue><fpage>250</fpage><pub-id pub-id-type="doi">10.1186/s12911-024-02656-3</pub-id><pub-id pub-id-type="medline">39252056</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="report"><person-group person-group-type="author"><name name-style="western"><surname>Garrubba</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Joseph</surname><given-names>C</given-names> </name><name name-style="western"><surname>Melder</surname><given-names>A</given-names> </name></person-group><article-title>Best practice to identify and prevent cognitive bias in clinical decision-making: scoping review</article-title><year>2019</year><access-date>2026-05-28</access-date><publisher-name>Monash Health</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://monashhealth.org/wp-content/uploads/2020/03/Cognitive-Bias_Scoping-Review_2019_FINAL.pdf">https://monashhealth.org/wp-content/uploads/2020/03/Cognitive-Bias_Scoping-Review_2019_FINAL.pdf</ext-link></comment></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="web"><article-title>Sharon202588/biased-research-llm</article-title><source>GitHub</source><access-date>2026-05-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/sharon202588/biased-research-llm.git">https://github.com/sharon202588/biased-research-llm.git</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompts for simulated patients, medical large language models, and mitigation strategies.</p><media xlink:href="jmir_v28i1e85770_app1.docx" xlink:title="DOCX File, 21 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Records of consultations with standard patients and cognitively biased patients in large language model&#x2013;supported health consultations.</p><media xlink:href="jmir_v28i1e85770_app2.docx" xlink:title="DOCX File, 41 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Evaluation of foundation and reasoning large language model diagnostic performance under standard and cognitively biased multiturn consultations.</p><media xlink:href="jmir_v28i1e85770_app3.png" xlink:title="PNG File, 
527 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Effect of decoding temperature on diagnostic accuracy under 3 evaluation conditions.</p><media xlink:href="jmir_v28i1e85770_app4.docx" xlink:title="DOCX File, 31 KB"/></supplementary-material></app-group></back></article>