<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e90061</article-id><article-id pub-id-type="doi">10.2196/90061</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Benchmark Integrity and Reasoning-Trace Errors in Medical Question Answering With Large Language Models: Mixed Methods Study With Sparse Autoencoders</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Liu</surname><given-names>Jialin</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Liu</surname><given-names>Siru</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Wright</surname><given-names>Adam</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff5">5</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Medical Informatics, West China Hospital of Sichuan University</institution><addr-line>Chengdu</addr-line><addr-line>Sichuan</addr-line><country>China</country></aff><aff id="aff2"><institution>Department of Otolaryngology-Head and Neck Surgery, West China Hospital of Sichuan University</institution><addr-line>Chengdu</addr-line><addr-line>Sichuan</addr-line><country>China</country></aff><aff id="aff3"><institution>Department of Biomedical Informatics, Vanderbilt University Medical Center</institution><addr-line>2525 West End Ave</addr-line><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff4"><institution>Department of Computer Science, Vanderbilt University</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff5"><institution>Department of Medicine, Vanderbilt University Medical Center</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Wu</surname><given-names>Chaochen</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ma</surname><given-names>Chunwei</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Bodagala</surname><given-names>Guru Lakshmi Priyanka</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Kurmashev</surname><given-names>Ruslan</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Siru Liu, PhD, Department of Biomedical Informatics, Vanderbilt University Medical Center, 2525 West End Ave, Nashville, TN, 37203, United States, 1 615 936 6867; <email>siru.liu@vumc.org</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>12</day><month>6</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e90061</elocation-id><history><date date-type="received"><day>20</day><month>12</month><year>2025</year></date><date date-type="rev-recd"><day>15</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>18</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Jialin Liu, Siru Liu, Adam Wright. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 12.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e90061"/><abstract><sec><title>Background</title><p>Large language models (LLMs) show promise for enhancing diagnostic accuracy and clinical decision-making. However, prevailing evaluations rely on examination-based benchmarks such as MedQA. Furthermore, the internal mechanisms driving both correct and incorrect reasoning in LLMs remain poorly understood, limiting opportunities for targeted improvement.</p></sec><sec><title>Objective</title><p>This study aimed to investigate failure modes of reasoning-based LLMs in medicine by (1) auditing the integrity of the MedQA benchmark, (2) developing a clinically informed taxonomy of reasoning errors across multiple major LLMs, and (3) testing a mechanistic intervention using sparse autoencoders (SAEs) to modulate reasoning characteristics and improve accuracy in medical question answering benchmarks.</p></sec><sec sec-type="methods"><title>Methods</title><p>We evaluated OpenAI o1 on the MedQA test set and cross-referenced incorrect answers against original source platforms to identify benchmark flaws, including missing figures and postrelease ambiguity corrections. For the 37 confirmed model failures remaining after exclusion of flawed items, we developed a reasoning error taxonomy through iterative inductive coding by 2 independent reviewers (JL and SL) and validated it on three major LLMs (ie, OpenAI GPT-4.5, OpenAI o3-mini, and DeepSeek-R1). We then trained an SAE on the DeepSeek-R1-Distill-Llama-8B model using MedQA-derived reasoning traces. Reasoning-specific features were identified using ReasonScore and subjected to activation steering at 2 strengths. 
Model accuracy, reasoning trace length, and hallucination metrics were measured across MedQA, MedMCQA, and PubMedQA. Hallucinations were evaluated using an LLM-as-a-judge (OpenAI GPT-5-mini) and validated on a stratified manual sample of 100 claims.</p></sec><sec sec-type="results"><title>Results</title><p>Forty-one percent of initial OpenAI o1 errors reflected benchmark problems, including missing figures (22%) and ambiguities subsequently corrected on the source platforms (19%). Neither OpenAI o1 nor OpenAI o3-mini explicitly flagged these flawed items, while GPT-5.2 identified a small subset, suggesting that question-integrity recognition remains limited and model-dependent. Among the 37 confirmed errors, our taxonomy classified failures into four categories: Information Synthesis Errors, Therapeutic Decision Errors, Diagnostic Reasoning Errors, and Foundational Principle Errors. Activation steering of reasoning-specific SAE features improved accuracy on MedQA and PubMedQA, with a consistent positive trend on MedMCQA. The greatest gains were observed at steering strength 2 (MedQA: 0.568-0.597 and PubMedQA: 0.708-0.739). Steering also increased reasoning-trace length substantially, with no significant correlation between verbosity and accuracy. Five functional feature categories were identified, with alignments to the error taxonomy.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>These findings reveal two distinct sources of unreliability in medical LLM evaluation: benchmark-level integrity gaps that misattribute model failure and recurrent model-level reasoning patterns potentially amenable to mechanistic correction. Notably, the benchmark issues identified here do not reflect static flaws in the original source platforms, which have since corrected many problematic items, but rather a failure to propagate those corrections to derived benchmarks. 
The alignment between SAE-identified feature categories and the error taxonomy further suggests that reasoning failures reflect structured internal processes that can be targeted at the feature level.</p></sec></abstract><kwd-group><kwd>large language model</kwd><kwd>medical question answering</kwd><kwd>sparse autoencoders</kwd><kwd>benchmark evaluation</kwd><kwd>reasoning errors</kwd><kwd>clinical decision support</kwd><kwd>mechanistic interpretability</kwd><kwd>activation steering</kwd><kwd>hallucination</kwd><kwd>artificial intelligence in medicine</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The integration of artificial intelligence (AI) into medicine presents a profound opportunity to enhance diagnostic accuracy and clinical decision-making. At the forefront of this transformation are large language models (LLMs), which, after training on vast corpora of medical literature and clinical guidelines, have demonstrated remarkable capabilities [<xref ref-type="bibr" rid="ref1">1</xref>]. These models show promise in diverse applications, from critiquing clinical decision support systems to drafting patient communications and summarizing clinical notes [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Recent LLM releases differ less in core architecture than in training incentives and inference-time policies that govern how much computation a model allocates before producing an answer [<xref ref-type="bibr" rid="ref7">7</xref>]. Reasoning models are optimized (eg, via reinforcement learning) to perform better on multistep problems and often generate longer intermediate rationales, whereas general-purpose chat models may produce shorter justifications by default. 
Importantly, generated rationales are not guaranteed to be faithful readouts of causal reasoning [<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>However, the prevailing methods for evaluating these advanced models remain a critical weak point. Current assessments predominantly rely on examination-based benchmarks, with a mere 5% using real patient data [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. MedQA, a widely used benchmark derived from the United States Medical Licensing Examination (USMLE), exemplifies this issue [<xref ref-type="bibr" rid="ref11">11</xref>]. Developed in 2020 from questions scraped from public websites, its multiple-choice format often tests rote memorization over nuanced clinical judgment [<xref ref-type="bibr" rid="ref12">12</xref>]. While some have questioned the validity of using examination questions to evaluate clinical AI [<xref ref-type="bibr" rid="ref13">13</xref>], there has been little systematic investigation into the intrinsic quality and reliability of the benchmark questions themselves. Prior work has shown that LLM performance drops substantially when multiple-choice shortcuts are disrupted, further underscoring the need to examine benchmark quality beyond surface-level accuracy [<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>A crucial factor in the performance of reasoning LLMs is their explicit, stepwise thought process, often refined through reinforcement learning. This can be optimized using outcome-based rewards, which assess only the final answer&#x2019;s correctness [<xref ref-type="bibr" rid="ref15">15</xref>], or process-based rewards, which evaluate the entire reasoning sequence [<xref ref-type="bibr" rid="ref16">16</xref>]. In deterministic fields such as mathematics, outcome-based rewards suffice. 
In medicine, however, the clinical logic must be as sound as the conclusion, making the analysis of reasoning errors essential for developing safe and effective clinical LLMs. While reasoning errors in LLMs have been studied in general domains, clinically grounded taxonomies of reasoning failures across multiple frontier models remain limited. Recent work in mechanistic interpretability has demonstrated that sparse autoencoders (SAEs) can identify and steer interpretable features in LLMs [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>], but this approach has not yet been applied to medical reasoning.</p><p>We address 3 linked gaps in evaluating and improving medical question-answering (QA) LLMs. First, we audit MedQA test-set integrity by reconciling incorrect model outputs with the original source platforms to identify missing modalities and postrelease corrections. Second, focusing on questions that remain well-specified, we develop an initial clinically informed taxonomy of observable reasoning-trace failures and examine how its distribution varies across several frontier models. Third, we test whether a mechanistic intervention, steering SAE features enriched around reasoning-trace tokens, can measurably shift accuracy and reasoning-trace properties across multiple medical QA datasets.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Reasoning Error Analysis and Taxonomy Development</title><p>We selected the MedQA test dataset (N=1273) of USMLE-style questions as our primary benchmark. We used OpenAI o1, a representative state-of-the-art reasoning model, to generate answers and a corresponding chain-of-thought reasoning process for each question. LLM prompts for answer and reasoning generation are listed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
We identified incorrectly answered questions by comparing the model&#x2019;s output with the provided answer key.</p><p>For each incorrect answer, we located the original question in its source examination bank (Medbullets, AMBOSS, or Lecturio). This allowed us to identify discrepancies, such as missing figures or subsequent question updates made by the source platforms to correct ambiguities. During reconciliation with source platforms, we also tracked whether models explicitly signaled uncertainty or unanswerability, such as noting missing information or required figures, or identifying ambiguity in the question stem. To assess whether newer models demonstrate improved question-integrity recognition, we additionally tested GPT-5.2-pro-2025-12-11 on the identified flawed items post hoc. This source reconciliation was feasible for MedQA because its questions could be traced to actively maintained platforms. Analogous auditing was not performed for MedMCQA or PubMedQA, as MedMCQA was compiled from various open websites and books without a single maintained source platform, and PubMedQA derives questions and answers from PubMed abstracts rather than from a curated examination bank.</p><p>After excluding questions with these external quality issues, we developed an error taxonomy using an inductive coding approach [<xref ref-type="bibr" rid="ref19">19</xref>]. Two coders (SL and JL) independently analyzed the reasoning processes for a subset of 5 incorrect answers, using open coding to identify error themes. The coders then met to discuss their findings and create a consensus-based coding guideline. This iterative process was repeated with new batches of 5 questions until all reasoning traces for the incorrectly answered questions were coded.</p><p>To validate the taxonomy, we assessed the performance of other advanced LLMs (OpenAI GPT-4.5, OpenAI o3-mini, and DeepSeek-R1) on the same set of questions. 
For models that do not natively output their reasoning, we used a step-by-step chain-of-thought prompt to elicit it. All models were accessed via their respective application programming interfaces (APIs). The specific API versions queried were OpenAI o1 (o1-preview, queried January 11, 2025), OpenAI o3-mini (o3-mini-2025-01-31, queried March 25, 2025), GPT-4.5 (gpt-4.5-preview-2025-02-27, queried March 25, 2025), and DeepSeek-R1 (queried March 25, 2025). Throughout the remainder of the paper, these models are referred to as o1, o3-mini, GPT-4.5, and DeepSeek-R1, respectively. Each question was evaluated in a single-shot setting with temperature set to 0. Other decoding parameters (top-p, seed, max tokens) were not explicitly set and followed the provider default. We then applied our taxonomy to classify the failure modes in each model&#x2019;s incorrect reasoning traces and manually compared reasoning traces from different models. Notably, DeepSeek-R1 was deliberately included as one of the validation models because the SAE analysis model (DeepSeek-R1-Distill-Llama-8B) was derived from it via knowledge distillation.</p></sec><sec id="s2-2"><title>SAE Development and Feature Steering</title><p>To further interpret the reasoning process quantitatively, we used an SAE, a method to decompose an LLM&#x2019;s internal activations into a sparse set of interpretable features [<xref ref-type="bibr" rid="ref17">17</xref>]. The core idea is that a neural network&#x2019;s internal representations are dense and difficult to interpret directly, as individual neurons often respond to multiple unrelated concepts (polysemanticity). An SAE addresses this by learning to reconstruct the model&#x2019;s activations through a higher-dimensional but sparse intermediate representation, where each dimension (or feature) ideally corresponds to a single interpretable concept. 
SAEs were chosen over other interpretability methods (eg, attention analysis and probing classifiers) because they enable both interpretation and intervention: once reasoning-relevant features are identified, their activations can be directly modified to test effects on model behavior.</p><p>We trained an SAE using the DeepSeek-R1-Distill-Llama-8B model. This model was chosen for its open-source availability, strong reasoning capabilities, and moderate size, which balanced performance with experimental efficiency. The training data included general conversation logs (LMSys-Chat-1M) [<xref ref-type="bibr" rid="ref20">20</xref>], general reasoning traces (OpenThoughts-114k) [<xref ref-type="bibr" rid="ref21">21</xref>], and reasoning traces generated by DeepSeek-R1 that led to correct answers on the MedQA training dataset. Notably, the MedQA training and testing sets are different, and the SAE evaluation was conducted exclusively on the test set. Following established methods, we extracted activations from the 19th layer of the model to train the SAE [<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>We used ReasonScore, a quantitative metric, to identify reasoning-specific features from a trained SAE [<xref ref-type="bibr" rid="ref18">18</xref>]. The metric works by identifying features that activate most strongly around predefined reasoning words within a fixed-width context window and using an entropy penalty to penalize features that activate on only a limited set of reasoning words. After identifying the top 100 features with the highest ReasonScore, we manually reviewed their activations and logits using an SAE dashboard [<xref ref-type="bibr" rid="ref22">22</xref>]. For the top 15 features, 2 reviewers (SL and JL) independently examined the top-activating contexts, activation patterns, and logit effects for each feature, then met to discuss and reconcile their functional labels through consensus. 
This process yielded 5 functional categories, which were then compared with the independently derived error taxonomy.</p><p>To test the impact of these features, we used activation steering [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. This method involves intentionally modifying the model&#x2019;s reasoning by adding a positive bias to the activations of top-performing features (using strengths of 2 and 4). Boosting a feature&#x2019;s activation encourages the model to more strongly incorporate that feature&#x2019;s specific concept into its subsequent processing. We measured the accuracy of the steered model on three medical benchmarks: MedQA, which served as the primary benchmark across all study components; and 2 other widely used benchmarks, MedMCQA (multiple-choice questions from Indian medical entrance examinations) and PubMedQA (yes, no, or maybe questions derived from PubMed abstracts). MedMCQA and PubMedQA were included to test whether steering effects generalize beyond the dataset used for taxonomy development and SAE training. The 26 MedQA items identified as having benchmark integrity issues in the audit phase were retained in the SAE evaluation, as these items affect all steering conditions equally and represent approximately 2% of the test set. In addition, we used an LLM-as-a-judge (OpenAI GPT-5-mini) to assess hallucinations in the reasoning process, categorizing them as factual hallucinations (contradicting known facts) or input hallucinations (contradicting the question prompt) and assigning a severity score from 1 to 3. LLM-generated labels were validated manually on a stratified sample of 100 claims across datasets and severity levels. Expert validation was performed by JL, a physician with an MD degree and clinical experience as an attending in otolaryngology&#x2013;head and neck surgery, as well as research experience in medical informatics. 
All disagreement cases were independently reviewed by SL, an investigator with a PhD in biomedical informatics and expertise in clinical AI evaluation. The severity correlation was computed at the claim level between the LLM judge&#x2019;s severity rating and the human annotator&#x2019;s independently assigned rating using the same rubric. To assess feature generalizability, we ranked features by their poststeering accuracy (exact match) within each dataset and steering strength, selected the top K features (for K=5, 10, 15, and 20), and computed the set intersection count between each dataset pair. Then, we selected the top 15 features, manually compared reasoning patterns before and after steering, and grouped them based on their potential function in reasoning. We used the <italic>SAELens</italic> package to train the SAE [<xref ref-type="bibr" rid="ref25">25</xref>], and the <italic>SAEDashboard</italic> package to visualize identified features and their impact in Python 3 [<xref ref-type="bibr" rid="ref22">22</xref>]. The overall project pipeline is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the project pipeline for reasoning&#x2010;error taxonomy development. LLM: large language model; QA: question answering.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e90061_fig01.png"/></fig></sec><sec id="s2-3"><title>Statistical Analysis</title><p>The same questions were evaluated under all steering conditions, yielding paired observations. For accuracy (binary outcome), McNemar tests were used for pairwise comparisons between steering strengths. For reasoning token counts, Wilcoxon signed-rank tests were used. For hallucination counts, where the per-question hallucination set differs across conditions, Mann-Whitney <italic>U</italic> tests were used. 
All pairwise tests were corrected for multiple comparisons using the Holm method. Chi-square tests were used to compare the distribution of hallucination types across conditions. Pearson correlation coefficients were calculated between reasoning token length and performance metrics (accuracy, hallucination frequency, and severity), with statistical significance assessed using <italic>t</italic> tests on the correlation coefficients. All statistical analyses were performed using Python 3 (Python Software Foundation).</p></sec><sec id="s2-4"><title>Ethical Considerations</title><p>This study used publicly available, deidentified data and did not require ethics approval.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Benchmark Quality Analysis</title><p>The OpenAI o1 model answered 63 of 1273 questions incorrectly, for an accuracy of 95%. Upon cross-referencing these 63 questions with their original examination banks, we found that 14 (22%) questions were missing figures that were essential for arriving at the correct answer (<xref ref-type="other" rid="box1">Textbox 1</xref>). For instance, one question about a patient&#x2019;s pressure-volume loop explicitly mentioned a figure, while another describing a patient with retrosternal burning implicitly relied on a figure showing diffuse lung fibrosis (<xref ref-type="fig" rid="figure2">Figure 2</xref>) to make the correct diagnosis. A complete list of questions that should include figures is provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><p>An additional 12 (19%) questions contained ambiguities that have since been corrected in the source question banks (<xref ref-type="other" rid="box1">Textbox 1</xref>). 
For example, a question about a 3-month-old infant with a holosystolic murmur was updated to include &#x201C;tetany is noted when taking the blood pressure,&#x201D; a hallmark sign that clarifies the diagnosis as 22q11 deletion syndrome (DiGeorge syndrome) rather than fetal alcohol syndrome. In another instance, a question involving a 78-year-old woman was revised. The version in MedQA implied that acute cognitive changes suggested a symptomatic urinary tract infection (warranting treatment), whereas the updated version, by noting that the patient continued to exhibit symptoms consistent with Alzheimer dementia, supported a decision for no treatment. Twelve questions (19%) that had serious ambiguity issues, resulting in answers that differ from those provided as correct, are listed in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p><boxed-text id="box1"><title> Examples of MedQA questions missing original figures and examples of questions with ambiguity issues. Bold text indicates updated information in the current online questions compared with the original questions in MedQA.</title><p><bold>Example 1</bold>. Figure explicitly mentioned and needed to answer correctly, but not present in the benchmark.</p><p>Question: A 72-year-old woman is admitted to the intensive care unit for shortness of breath and palpitations. A cardiac catheterization is performed, and measurements of the left ventricular volume and pressure at different points in the cardiac cycle are obtained. The patient&#x2019;s pressure-volume loop (gray) is shown with a normal pressure-volume loop (black) for comparison. Which of the following is the most likely underlying cause of this patient&#x2019;s symptoms?</p><p>Options:</p><p>A: Mitral valve regurgitation</p><p>B: Increased systemic vascular resistance</p><p>C: Increased ventricular wall stiffness</p><p>D: Impaired left ventricular contractility</p><p><bold>Example 2</bold>. 
Figure not explicitly mentioned but needed to answer correctly, and not present in the benchmark.</p><p>Question: A 43-year-old woman presents with complaints of retrosternal burning associated with eating. It has persisted for the past several years but has been getting worse. Her past medical history is unknown and this is her first time seeing a doctor. She states she is otherwise healthy and review of systems is notable for episodic hand pain that is worse in the winter as well as a chronic and severe cough with dyspnea, which she attributes to her smoking. Her temperature is 97.7 &#x00B0;F (36.5 &#x00B0;C), blood pressure is 174/104 mmHg, pulse is 80/min, respirations are 22/min, and oxygen saturation is 92% on room air. Physical exam is notable for a young appearing woman with coarse breath sounds. Laboratory studies and urinalysis are ordered and currently pending. Which of the following is the pathophysiology of this patient&#x2019;s chief complaint?</p><p>Options:</p><p>A: Decreased lower esophageal tone</p><p>B: Esophageal fibrosis</p><p>C: Increased lower esophageal tone</p><p>D: Spastic cricopharyngeal muscle</p><p>Note: The original question included a chest computed tomography image (<xref ref-type="fig" rid="figure2">Figure 2</xref>), which is necessary to answer this question.</p><p><bold>Example 3</bold>. A question with ambiguity issues.</p><p>MedQA question: A 3-month-old infant is brought to her pediatrician because she coughs and seems to have difficulty breathing while feeding. In addition, she seems to have less energy compared to other babies and appears listless throughout the day. She was born by cesarean section to a G1P1 woman with no prior medical history and had a normal APGAR score at birth. Her parents say that she has never been observed to turn blue. Physical exam reveals a high-pitched holosystolic murmur that is best heard at the lower left sternal border. 
The most likely cause of this patient&#x2019;s symptoms is associated with which of the following abnormalities?</p><p>Options:</p><p>A: 22q11 deletion</p><p>B: Deletion of genes on chromosome 7</p><p>C: Lithium exposure in utero</p><p>D: Maternal alcohol consumption</p><p>Updated question: A 3-month-old infant is brought to her pediatrician because she coughs and seems to have difficulty breathing while feeding. In addition, she seems to have less energy compared to other babies and appears listless throughout the day. She was born by cesarean section to a G1P1 woman with no prior medical history and had a normal APGAR score at birth. Her parents say that she has never been observed to turn blue. Physical exam reveals a high-pitched holosystolic murmur that is best heard at the lower left sternal border. <bold>Tetany is noted when taking the blood pressure.</bold> The most likely cause of this patient&#x2019;s symptoms is associated with which of the following abnormalities?</p><p>MedQA question: A 78-year-old woman presents to the office for an annual health check-up with her family physician accompanied by her daughter. She has no complaints during this visit but her daughter states that she is having difficulty locating objects such as the television remote, car keys, and her purse. Her medical history is significant for Alzheimer&#x2019;s dementia, coronary artery disease, diabetes mellitus, hypothyroidism, congestive heart failure, osteoarthritis, and centrilobular emphysema. The patient takes memantine, atorvastatin, metformin, levothyroxine, lisinopril, aspirin, albuterol, and ipratropium. The patient&#x2019;s vitals are within normal limits today. Physical exam reveals an elderly female in no acute distress, oriented to person, place, and year, but not to month or day of the week. She has a 3/6 holosystolic murmur at the left sternal border along with an S3 gallop. There are mild crackles at the lung bases. The remainder of the exam is normal. 
A previous urine culture reports growth of &#x003E; 100,000 CFU of Enterobacter. Urinalysis findings are offered below:</p><p>Leukocyte esterase positive</p><p>WBCs 50&#x2010;100 cell/HPF</p><p>Nitrites positive</p><p>RBCs 2 cell/HPF</p><p>Epithelial cells 2 cell/HPF</p><p>Urine pH 5.7</p><p>Which of the following is the most appropriate next step?</p><p>A: TMP-SMX</p><p>B: Nitrofurantoin</p><p>C: Levofloxacin</p><p>D: No treatment is necessary</p><p>Updated question: A 78-year-old woman presents with her daughter for an annual health check-up with her family physician. The patient has no complaints during this visit but her daughter states that the patient continues to manifest symptoms consistent with her known diagnosis of Alzheimer dementia, including having difficulty locating objects such as her television remote, car keys, and purse. Her other medical history is significant for coronary artery disease, diabetes mellitus, hypothyroidism, congestive heart failure, osteoarthritis, and centrilobular emphysema. The patient takes memantine, atorvastatin, metformin, levothyroxine, lisinopril, aspirin, albuterol, and ipratropium. The patient&#x2019;s vitals are within normal limits. Physical exam reveals no acute distress. She is oriented to person, place, and year, but not to month or day of the week. She has a 3/6 holosystolic murmur at the left sternal border along with an S3 gallop. There are mild crackles at the lung bases. The remainder of the exam is normal. Two urine cultures performed in one week each reported growth of &#x003E; 100,000 CFU of Enterobacter. 
Urinalysis findings are presented below:</p><p>Leukocyte esterase positive</p><p>WBCs 50&#x2010;100 cell/HPF</p><p>Nitrites positive</p><p>RBCs 2 cell/HPF</p><p>Epithelial cells 2 cell/HPF</p><p>Urine pH 5.7</p><p>Which of the following is the most appropriate next step?</p><p>A: TMP-SMX</p><p>B: Nitrofurantoin</p><p>C: Levofloxacin</p><p>D: No treatment is necessary</p></boxed-text><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Chest computed tomography image originally accompanying example 2 in MedQA. This figure was present in the source question bank but absent from the MedQA benchmark dataset.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e90061_fig02.png"/></fig><p>Across the 26 flawed MedQA items, neither o1 nor o3-mini explicitly identified missing figures or ambiguity in any case, suggesting that earlier-generation reasoning models lack reliable question-integrity recognition. In contrast, GPT-5.2-pro-2025-12-11 flagged 5 questions as missing required figures and identified 1 question with unresolved ambiguity, indicating an emerging but still limited ability to detect underspecified or unanswerable items. Overall, explicit detection of unanswerable or underspecified questions was infrequent, indicating that question-integrity recognition remains limited and model-dependent for medical questions.</p></sec><sec id="s3-2"><title>Taxonomy of Reasoning Errors</title><p>After excluding questions with missing figures or ambiguity, 37 questions (14 from USMLE step 1 examinations and 23 from USMLE step 2 and step 3 examinations) remained for our error analysis. From these, we identified four major error categories: Information Synthesis Errors, Therapeutic Decision Errors, Diagnostic Reasoning Errors, and Foundational Principle Errors. 
The definitions and examples for each category are detailed in <xref ref-type="other" rid="box2">Textbox 2</xref>.</p><boxed-text id="box2"><title> Taxonomy of reasoning errors with definitions and examples. COPD: chronic obstructive pulmonary disease.</title><p><bold>Information Synthesis Errors:</bold></p><list list-type="bullet"><list-item><p>Misjudgment of Clinical Feature Importance:</p></list-item></list><list list-type="bullet"><list-item><p>Definition: overemphasizes irrelevant details while ignoring critical ones.</p></list-item><list-item><p>Example 1: in a lung cancer case involving a 76-year-old man with COPD and asbestos exposure, the reasoning process overemphasized the patient&#x2019;s asbestos exposure and pleural plaques, while neglecting symptoms such as weight loss and anemia, and his 60-pack-year smoking history that was more indicative of malignancy.</p></list-item><list-item><p>Example 2: in a case of chronic abdominal pain with multiorgan involvement, the focus was incorrectly placed on a recent impetigo episode and the possibility of poststreptococcal glomerulonephritis, thereby overlooking systemic evidence pointing to a chronic condition such as secondary amyloidosis.</p></list-item></list><p><bold>Therapeutic Decision Errors:</bold></p><list list-type="bullet"><list-item><p>Misapplication of Evidence-Based Guidelines:</p></list-item></list><list list-type="bullet"><list-item><p>Definition: fails to select or correctly apply clinical guidelines.</p></list-item><list-item><p>Example 1: the reasoning process overemphasized the use of an adjunctive drug for tumor lysis syndrome while neglecting intravenous hydration, the primary strategy for managing this condition.</p></list-item><list-item><p>Example 2: the reasoning process incorrectly selected griseofulvin despite guidelines recommending itraconazole as the first-line therapy for patients with the fungal infection <italic>tinea corporis</italic>.</p></list-item></list><list 
list-type="bullet"><list-item><p>Inadequate Dynamic Risk-Benefit Assessment:</p></list-item></list><list list-type="bullet"><list-item><p>Definition: fails to weigh evolving clinical risks and benefits.</p></list-item><list-item><p>Example 1: in a case of circulatory electrolyte imbalance in hepatic encephalopathy, the reasoning process prioritized correcting hypoglycemia while neglecting hypokalemia.</p></list-item><list-item><p>Example 2: in a case of hemoptysis with thrombolytic therapy, the reasoning process did not adequately account for the bleeding risks associated with thrombolytic therapy, underestimating the potential for life-threatening hemorrhage.</p></list-item></list><list list-type="bullet"><list-item><p>Misinterpretation of Pharmacologic Mechanisms:</p></list-item></list><list list-type="bullet"><list-item><p>Definition: misunderstands a medication&#x2019;s mechanism of action.</p></list-item><list-item><p>Example 1: in a patient with seasonal allergies, the reasoning process incorrectly identified the drug&#x2019;s mechanism as competitive blockade of muscarinic receptors (used in asthma management) instead of recognizing that the appropriate decongestant works as an &#x03B1;-adrenergic agonist.</p></list-item><list-item><p>Example 2: in a gout prophylaxis scenario, pancytopenia was erroneously attributed to colchicine toxicity rather than correctly identifying the mechanism of a xanthine oxidase inhibitor and its potential interaction with immunosuppressive agents.</p></list-item></list><list list-type="bullet"><list-item><p>Premature Cognitive Closure:</p></list-item></list><list list-type="bullet"><list-item><p>Definition: a reasoning error in which the model quickly settles on a diagnosis or management plan without sufficiently exploring alternative explanations or contributing factors.</p></list-item><list-item><p>Example: in a case of erectile dysfunction in a patient on selective serotonin reuptake inhibitors with significant vascular risk 
factors, the reasoning process prematurely closed off further investigation (eg, nocturnal penile tumescence testing) that could better clarify the underlying cause.</p></list-item></list><p><bold>Diagnostic Reasoning Errors:</bold></p><list list-type="bullet"><list-item><p>Failure to Integrate Pathophysiological Mechanisms:</p></list-item></list><list list-type="bullet"><list-item><p>Definition: fails to synthesize and apply key pathophysiological principles, resulting in misattribution or misunderstanding of clinical findings.</p></list-item><list-item><p>Example 1: when analyzing the relevant option, the model failed to link the characteristic facial anomalies (eg, low-set ears and retrognathia) with the Potter sequence.</p></list-item><list-item><p>Example 2: the model overlooked that achlorhydria in a vasoactive intestinal peptide-secreting tumor (VIPoma) leads to impaired iron absorption, failing to integrate this pathophysiological mechanism into its diagnostic reasoning.</p></list-item></list><list list-type="bullet"><list-item><p>Deviation from Prioritized Diagnostic Protocols:</p></list-item></list><list list-type="bullet"><list-item><p>Definition: initiates treatment before completing necessary diagnostic workups.</p></list-item><list-item><p>Example 1: when evaluating a 2-month-old infant with signs of head trauma and suspicious injury patterns, the model prioritized nonmedical interventions (eg, involving social services) over obtaining an urgent head computed tomography (CT) scan, delaying critical diagnostic evaluation.</p></list-item><list-item><p>Example 2: in the initial management of a transient ischemic attack, if the CT scan is normal, a CT angiogram is indicated to further characterize the cerebral vessels. 
The reasoning process, however, wrongly opted for heparin therapy.</p></list-item></list><p><bold>Foundational Principle Errors:</bold></p><list list-type="bullet"><list-item><p>Misapplication of Ethical Principles:</p></list-item></list><list list-type="bullet"><list-item><p>Definition: fails to follow proper ethical protocols.</p></list-item><list-item><p>Example 1: in the case of a surgical complication, the reasoning process failed to follow proper communication protocols by not discussing the complication with the attending physician before reporting it.</p></list-item><list-item><p>Example 2: in the situation where a daughter refuses consent due to an abuse history, the reasoning process neglected to involve the appropriate legal guardian, instead opting to seek immediate court intervention without first contacting the next of kin.</p></list-item></list><list list-type="bullet"><list-item><p>Misinterpretation of Statistical Concepts:</p></list-item></list><list list-type="bullet"><list-item><p>Definition: misinterprets statistical principles.</p></list-item><list-item><p>Example 1: the reasoning process failed to recognize that the <italic>P</italic> value is computed under the assumption that the null hypothesis is true.</p></list-item><list-item><p>Example 2: the reasoning process failed to differentiate between lead-time bias and measurement bias.</p></list-item></list></boxed-text></sec><sec id="s3-3"><title>Comparison of Errors Across Different LLMs</title><p>We tested other leading models on the 37 challenging questions. Accuracy was 49% for o3-mini, 41% for GPT-4.5, and 38% for DeepSeek-R1. 
Reasoning process lengths varied significantly, from an average of 1319 characters for o3-mini to 11,055 characters for DeepSeek-R1.</p><p>An analysis of the reasoning traces revealed that different models used highly distinct problem-solving strategies, especially on questions requiring multistep synthesis (<xref ref-type="fig" rid="figure3">Figures 3</xref> and <xref ref-type="fig" rid="figure4">4</xref>). For example, one question required understanding that immunoglobulin A (IgA) deficiency is associated with celiac disease and that these patients are typically deficient in fat-soluble vitamins.</p><p>As shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>, the o1, GPT-4.5, and o3-mini models attempted a linear process but failed for different reasons. They first identified the key patient details, then developed an initial differential diagnosis, and finally assessed the available answer options. The o1 model recognized the possibility of IgA deficiency and evaluated each option, linking option A (Hemolytic anemia and ataxia) with vitamin E deficiency. However, it did not connect the vitamin E deficiency back to IgA deficiency. In the GPT-4.5 process, the analysis of option A led directly to its association with certain hereditary disorders, such as Friedreich ataxia and other conditions involving neurological and hematological symptoms, resulting in the elimination of what was actually the correct choice. In contrast, the o3-mini model exhibited a shortcut: it noted, &#x201C;Although the answer choices do not directly mention IgA deficiency or anti-IgA antibodies, the only option referring to antibodies (which in this context are responsible for transfusion reactions) is option D: anti-A, B, or O antibodies in the serum.&#x201D;</p><p>Meanwhile, DeepSeek-R1&#x2019;s process was fundamentally different (<xref ref-type="fig" rid="figure4">Figure 4</xref>). 
It used an iterative, 3-round analysis, repeatedly reassessing the problem, highlighting a completely distinct cognitive architecture from the other models. The detailed reasoning processes for each model are listed in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p><p>When tested on questions that o1 failed, the models displayed varied error patterns (<xref ref-type="table" rid="table1">Table 1</xref>). Information Synthesis Errors were most frequent in OpenAI o1 (11 errors), while Therapeutic Decision Errors were common across all models. These results highlight model-specific differences in reasoning failure modes.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Examples of identified errors in the reasoning traces of OpenAI o3-mini, OpenAI o1, and GPT-4.5. BP: blood pressure; IgA: immunoglobulin A; IV: intravenous.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e90061_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Examples of identified errors in the reasoning traces of DeepSeek-R1. 
DIC: disseminated intravascular coagulation; GCS: Glasgow Coma Scale; IgA: immunoglobulin A; IV: intravenous; TRALI: transfusion-related acute lung injury.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e90061_fig04.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Distribution of error types across 4 large language models for 37 challenging questions.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Error category and subcategory</td><td align="left" valign="bottom">OpenAI o1</td><td align="left" valign="bottom">GPT-4.5</td><td align="left" valign="bottom">OpenAI o3-mini</td><td align="left" valign="bottom">DeepSeek-R1</td></tr></thead><tbody><tr><td align="left" valign="top">Information Synthesis Errors</td><td align="left" valign="top">11</td><td align="left" valign="top">5</td><td align="left" valign="top">4</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">&#x2003;Misjudgment of Clinical Feature Importance</td><td align="left" valign="top">11</td><td align="left" valign="top">5</td><td align="left" valign="top">4</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">Therapeutic Decision Errors</td><td align="left" valign="top">11</td><td align="left" valign="top">6</td><td align="left" valign="top">7</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">&#x2003;Misapplication of Evidence-Based Guidelines</td><td align="left" valign="top">4</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">&#x2003;Inadequate Dynamic Risk-Benefit Assessment</td><td align="left" valign="top">3</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left"
valign="top">&#x2003;Misinterpretation of Pharmacologic Mechanisms</td><td align="left" valign="top">3</td><td align="left" valign="top">4</td><td align="left" valign="top">3</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">&#x2003;Premature Cognitive Closure</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">3</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">Diagnostic Reasoning Errors</td><td align="left" valign="top">9</td><td align="left" valign="top">8</td><td align="left" valign="top">4</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">&#x2003;Failure to Integrate Pathophysiological Mechanisms</td><td align="left" valign="top">5</td><td align="left" valign="top">6</td><td align="left" valign="top">3</td><td align="left" valign="top">7</td></tr><tr><td align="left" valign="top">&#x2003;Deviation from Prioritized Diagnostic Protocols</td><td align="left" valign="top">4</td><td align="left" valign="top">2</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">Foundational Principle Errors</td><td align="left" valign="top">6</td><td align="left" valign="top">3</td><td align="left" valign="top">4</td><td align="left" valign="top">4</td></tr><tr><td align="left" valign="top">&#x2003;Misapplication of Ethical Principles</td><td align="left" valign="top">4</td><td align="left" valign="top">2</td><td align="left" valign="top">3</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">&#x2003;Misinterpretation of Statistical Concepts</td><td align="left" valign="top">2</td><td align="left" valign="top">1</td><td align="left" valign="top">1</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">Total</td><td align="left" valign="top">37</td><td align="left" valign="top">22</td><td align="left" valign="top">19</td><td
align="left" valign="top">23</td></tr></tbody></table></table-wrap></sec><sec id="s3-4"><title>Feature Steering via SAE</title><p>Steering with reasoning-specific features enhanced model accuracy, with significant gains on MedQA (<xref ref-type="table" rid="table2">Table 2</xref>; <italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=10.9; <italic>P</italic>=.002) and PubMedQA (<italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=18.6; <italic>P</italic>&#x003C;.001) and a consistent positive trend on MedMCQA (<italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=3.9; <italic>P</italic>=.15). The most substantial gains were observed at a moderate steering strength of 2, which increased MedQA accuracy from 0.568 to 0.597 (95% CI 0.584-0.610) and PubMedQA accuracy from 0.708 to 0.739 (95% CI 0.722-0.756). This performance improvement, however, coincided with a significant increase in the verbosity of the model&#x2019;s reasoning traces. At a steering strength of 4, the average reasoning token count nearly doubled for MedQA (Wilcoxon W=6,010,329; <italic>P</italic>&#x003C;.001) and tripled for PubMedQA (Wilcoxon W=257,830; <italic>P</italic>&#x003C;.001). The intervention had a limited effect on hallucination frequency; a significant increase was observed for MedMCQA at strength 2 (Mann-Whitney <italic>U</italic>=1,364,912; <italic>P</italic>=.008), with no significant changes for MedQA (<italic>U</italic>=481,828; <italic>P</italic>=.99) or PubMedQA (<italic>U</italic>=33,695; <italic>P</italic>=.99). Notably, the predominant type of hallucination remained consistent within each dataset regardless of steering. However, chi-square analysis revealed a significant shift in the distribution of hallucination types for MedMCQA (<italic>&#x03C7;</italic>&#x00B2;<sub>2</sub>=8.1; <italic>P</italic>=.02), but not for MedQA (<italic>&#x03C7;</italic>&#x00B2;<sub>2</sub>=1.4; <italic>P</italic>=.49) or PubMedQA (<italic>&#x03C7;</italic>&#x00B2;<sub>2</sub>=1.1; <italic>P</italic>=.58). 
Across all conditions, factual hallucinations were the most common type in MedMCQA and MedQA, whereas input hallucinations predominated in PubMedQA (Table 4). Finally, no significant correlations were identified between reasoning length and key performance metrics such as accuracy, hallucination count, or hallucination severity. These correlations were computed at the level of individual questions within each steering condition. Expert validation achieved 96% precision, with the 4 disagreement cases representing conservative overflagging of borderline medical claims rather than clear errors. The claim-level Pearson correlation between the LLM judge and human annotator severity ratings was 0.86.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Model performance across steering strengths on 3 medical question-answering (QA) benchmarks. Values are reported as point estimates with 95% CIs computed from question-level paired data.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset and strength</td><td align="left" valign="bottom">Accuracy (95% CI)</td><td align="left" valign="bottom">Reasoning<break/>tokens (95% CI)</td><td align="left" valign="bottom">Hallucinations<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup><break/>(95% CI)</td><td align="left" valign="bottom">Factual<break/>hallucinations<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> (95% CI)</td><td align="left" valign="bottom">Severity<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup><break/>(0&#x2010;3)<break/>(95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="6">MedMCQA</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>None</td><td align="left" valign="top">0.505 (0.503-0.507)</td><td align="left" valign="top">1287<break/>(1237-1338)</td><td align="left" 
valign="top">2.2<break/>(2.154-2.247)</td><td align="left" valign="top">2.15 (2.104-2.197)</td><td align="left" valign="top">2.199<break/>(2.186-2.212)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">0.522 (0.508-0.536)</td><td align="left" valign="top">1500<break/>(1294-1706)</td><td align="left" valign="top">2.378<break/>(2.284-2.472)</td><td align="left" valign="top">2.284 (2.191-2.377)</td><td align="left" valign="top">2.194<break/>(2.168-2.221)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4</td><td align="left" valign="top">0.525 (0.511-0.538)</td><td align="left" valign="top">2457<break/>(2065-2850)</td><td align="left" valign="top">2.163<break/>(2.066-2.260)</td><td align="left" valign="top">2.107 (2.011-2.203)</td><td align="left" valign="top">2.203<break/>(2.176-2.230)</td></tr><tr><td align="left" valign="top" colspan="6">MedQA</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>None</td><td align="left" valign="top">0.568 (0.565-0.570)</td><td align="left" valign="top">1682<break/>(1587-1777)</td><td align="left" valign="top">2.605<break/>(2.515-2.695)</td><td align="left" valign="top">2.488 (2.398-2.577)</td><td align="left" valign="top">2.265<break/>(2.241-2.289)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">0.597 (0.584-0.610)</td><td align="left" valign="top">1886<break/>(1671-2101)</td><td align="left" valign="top">2.557<break/>(2.456-2.658)</td><td align="left" valign="top">2.456 (2.356-2.556)</td><td align="left" valign="top">2.250<break/>(2.223-2.277)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4</td><td align="left" valign="top">0.589 (0.575-0.602)</td><td align="left" valign="top">2912<break/>(2520-3303)</td><td align="left" valign="top">2.658<break/>(2.554-2.762)</td><td align="left" valign="top">2.531 (2.429-2.633)</td><td align="left" valign="top">2.225 (2.198-2.252)</td></tr><tr><td align="left" valign="top" colspan="6">PubMedQA</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>None</td><td align="left" valign="top">0.708 (0.704-0.712)</td><td align="left" valign="top">670<break/>(652-687)</td><td align="left" valign="top">1.064<break/>(0.954-1.174)</td><td align="left" valign="top">0.384 (0.309-0.459)</td><td align="left" valign="top">1.837<break/>(1.785-1.888)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">0.739 (0.722-0.756)</td><td align="left" valign="top">861<break/>(731-991)</td><td align="left" valign="top">1.08<break/>(0.970-1.190)</td><td align="left" valign="top">0.398 (0.319-0.477)</td><td align="left" valign="top">1.787<break/>(1.735-1.839)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4</td><td align="left" valign="top">0.737 (0.719-0.754)</td><td align="left" valign="top">2187<break/>(1696-2677)</td><td align="left" valign="top">0.962<break/>(0.855-1.069)</td><td align="left" valign="top">0.336 (0.265-0.407)</td><td align="left" valign="top">1.734<break/>(1.677-1.790)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Average count of hallucination claims per reasoning trace.</p></fn><fn id="table2fn2"><p><sup>b</sup>Average count of factual hallucination claims per reasoning trace.</p></fn><fn id="table2fn3"><p><sup>c</sup>0&#x2010;3 rating by a large language model 
judge (GPT-5-mini); higher scores indicate greater severity.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-5"><title>Functional Analysis of Reasoning Features</title><p>Feature overlap analysis revealed distinct generalization patterns across medical domains. At strength 4, four features (IDs: 10602, 29040, 37660, and 56984) consistently ranked in the top 20 across all 3 datasets, suggesting robust utility. MedQA and MedMCQA showed the strongest feature similarity, sharing 13 features in their top 20 at strength 4, while PubMedQA demonstrated more domain-specific feature requirements. Higher SAE strength (4 vs 2) consistently improved cross-dataset feature generalization. Feature overlap heatmaps are shown in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>. Manual analysis of the top 15 features allowed us to group them into 5 functional categories that aligned with our error taxonomy (<xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>). These categories included (1) cue-weighting calibration and distractor suppression, which focuses the model on pivotal clinical data; (2) protocol alignment, which steers outputs toward guideline-consistent procedures; (3) mechanistic grounding, which connects decisions to core pathophysiological or pharmacological principles; (4) rule/criteria enforcement, which ensures the precise application of formal definitions such as Light&#x2019;s criteria; and (5) evidence synthesis and question reframing, which helps the model correctly interpret the core question being asked. 
Several features were multifunctional, contributing to more than one reasoning category (eg, feature 56984).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>In this study, we conducted a mixed methods analysis to audit the integrity of a widely used medical benchmark, characterize LLM reasoning errors, and test a mechanistic intervention using SAEs. Our findings reveal that a significant portion of apparent model failures on MedQA are attributable to intrinsic data flaws, that true reasoning errors can be categorized into a clinically relevant taxonomy, and that steering reasoning-specific features can effectively improve LLM accuracy on medical QA benchmarks.</p></sec><sec id="s4-2"><title>Rethinking Benchmarks: The Problem Lies in the Questions</title><p>An important finding of our work is that 41% of the initial incorrect answers generated by OpenAI o1 were due to flawed benchmark questions, including 14 missing necessary figures and 12 containing ambiguities that have since been updated in their source question banks. These integrity findings are specific to MedQA; we did not perform the same source reconciliation for MedMCQA or PubMedQA.</p><p>This audit indicates that MedQA, as instantiated in widely used benchmark distributions, contains a nontrivial fraction of items whose fidelity to the original sources is compromised (eg, missing figures or subsequently corrected ambiguity). These issues can confound model evaluation by attributing errors to models that may instead reflect dataset drift or modality loss introduced during benchmark construction.</p><p>More broadly, these findings underscore a structural risk in deriving medical benchmarks from examination question banks without preserving all clinically relevant modalities. 
In clinical problem-solving, visual data (eg, imaging, waveforms, or figures) are often integral to reasoning, and their omission fundamentally alters the task being evaluated. Under these conditions, a model&#x2019;s apparent success on a flawed question may plausibly arise from pattern matching rather than robust clinical reasoning. This concern is consistent with prior work showing substantial performance drops when multiple-choice shortcuts are disrupted, such as replacing the correct option with &#x201C;none of the above,&#x201D; which reduced accuracy by 8%&#x2010;38% [<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>As the field relies heavily on such benchmarks to gauge progress, our findings serve as a critical call to action for improved validation and management of medical benchmarks. Unlike static domains such as mathematics or coding, medical knowledge is constantly evolving (eg, clinical practice guidelines are updated and new drugs emerge) [<xref ref-type="bibr" rid="ref26">26</xref>]. To remain relevant, medical benchmarks must be treated as dynamic resources, continuously aligned with the current state of practice. This necessitates a move toward more rigorously curated, version-controlled, and multimodally complete datasets, supported by automated methods for ongoing validation. In addition, when maintainers of source question banks identify and correct errors, a corresponding process must be in place to update the benchmark promptly. Without such reliable evaluation tools, we risk overestimating the capabilities of current models and misdirecting development efforts.</p></sec><sec id="s4-3"><title>A Taxonomy of Reasoning Failures: From &#x201C;What&#x201D; to &#x201C;Why&#x201D;</title><p>By isolating the 37 model errors, we developed a 4-category taxonomy that shifts the focus from whether a model is wrong to why it is wrong. 
The prevalence of Information Synthesis Errors (particularly for OpenAI o1) and Therapeutic Decision Errors across all tested models highlights vulnerabilities in processes central to clinical practice: weighing evidence, applying guidelines, and performing dynamic risk-benefit assessments. The observation that different state-of-the-art models exhibit distinct error profiles (<xref ref-type="table" rid="table1">Table 1</xref>) suggests that their underlying architectures and training data instill unique cognitive biases. For instance, some models may be prone to premature closure, while others struggle to integrate complex pathophysiological mechanisms. This granular, qualitative understanding of failure modes is essential for identifying high-risk scenarios and is a necessary prerequisite for building safer, more reliable systems.</p></sec><sec id="s4-4"><title>A Mechanistic Path Toward Safer AI: From Observation to Intervention</title><p>Another contribution of this study is the bridge between the qualitative error taxonomy and the quantitative SAE intervention. By identifying and steering reasoning-specific features, we not only improved accuracy on multiple medical benchmarks but also uncovered the functional roles of these features. Some clear alignments were identified: feature groups such as &#x201C;prioritizes critical information and filters out irrelevant data&#x201D; and &#x201C;protocol alignment&#x201D; mechanistically address the error categories of &#x201C;Misjudgment of Clinical Feature Importance&#x201D; and &#x201C;Deviation from Prioritized Diagnostic Protocols,&#x201D; respectively (<xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>). These results provide an initial proof of concept that interpretable, reasoning-specific SAE features can be modulated to shift overall accuracy and that the resulting feature categories conceptually correspond to several of the error types in our taxonomy. 
They do not, however, demonstrate that steering individual feature subgroups causally corrects the specific error categories with which they conceptually align; such a claim would require single-feature or feature-group interventions on a larger error-annotated corpus drawn from the same model, which we identify as a key direction for future work.</p><p>This intervention also revealed a trade-off. Steering with strengthened reasoning-specific features increased accuracy, particularly on MedQA and PubMedQA, but also significantly lengthened the reasoning traces, more than doubling the token count in some cases. Notably, however, the relationship between steering strength and accuracy was nonmonotonic: increasing strength from 2 to 4 more than doubled token counts across all 3 benchmarks, yet accuracy slightly declined in some cases (eg, MedQA: from 0.597 to 0.589). This pattern, combined with the absence of significant correlations between reasoning trace length and any performance metric, indicates that improved accuracy is not a simple byproduct of increased verbosity or test-time computation. Rather, it likely results from the targeted activation of specific, high-value reasoning pathways. The distinction between group-level and individual-level effects is important here: while steering shifts the mean accuracy and mean trace length upward relative to baseline, within a given condition, longer traces do not predict correct answers for individual questions, further suggesting that the benefit is feature-specific rather than length-dependent. We note that prompt-based interventions designed to elicit longer reasoning operate at the input level rather than directly modifying internal representations and therefore target a different causal pathway than activation steering; exploring such comparisons remains a valuable direction for future work. More broadly, the increased token cost associated with steering presents a practical design challenge. 
This work represents a proof of concept; potential strategies for mitigating this trade-off in future implementations could include selective steering for high-uncertainty cases or adaptive strength calibration.</p></sec><sec id="s4-5"><title>Strengths and Limitations</title><p>This study&#x2019;s primary strength lies in its novel mixed methods approach, which combines a qualitative analysis of benchmark integrity and LLM reasoning errors with a quantitative, mechanistic intervention using SAEs. This provides a uniquely holistic view of the challenges and opportunities in improving LLM performance on medical reasoning benchmarks.</p><p>However, several limitations remain. First, we used different models across study stages: the error taxonomy was developed on frontier models (eg, OpenAI o1), while SAE training and steering were conducted on a single open-source distilled model (DeepSeek-R1-Distill-Llama-8B). Because distilled student models can develop internal representations distinct from their teachers [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>], the alignment between our taxonomy and the 8B model&#x2019;s SAE features is suggestive rather than direct mechanistic evidence. Consequently, these identified features are not directly transferable to closed-source models. Second, our error taxonomy is based on 37 incorrect responses from the o1 model, which may not capture the full spectrum of reasoning failures. Specifically, it misses &#x201C;silent failures,&#x201D; cases where incorrect reasoning coincidentally yields a correct answer. Characterizing these hidden errors through expert review of complete reasoning traces remains an important direction for future work. Third, using an LLM-as-a-judge to evaluate hallucinations is scalable but introduces potential biases. Our human validation revealed conservative overflagging by the LLM judge. 
Furthermore, comprehensively estimating recall (false negatives) was infeasible because of the massive data volume and specialized clinical expertise required. As a result, our reported hallucination counts should be interpreted as lower-bound estimates. Finally, there are technical and evaluative constraints regarding the SAE intervention. Our evaluation relies on QA benchmarks, which do not fully replicate the dynamic, unstructured nature of real-world clinical decision-making. Technically, the SAE training was limited to a single layer (layer 19) and faced common interpretability challenges, such as incomplete resolution of polysemanticity and the difficulty of completely disentangling reasoning from factual features. Future work should compare this approach against steering random, nonreasoning features and extend evaluations to real patient data.</p></sec><sec id="s4-6"><title>Future Work</title><p>Our findings point toward several key directions for future research. First, there is an urgent need for the community to develop validation and management methods for maintaining medical benchmarks, ensuring fair and reproducible evaluations. Second, the reasoning-specific features identified by SAEs could be integrated into more advanced training paradigms. For example, they could inform process-based reward models in reinforcement learning to explicitly penalize flawed reasoning pathways identified in our taxonomy, potentially training models that are both accurate and efficient. In addition, the error taxonomy developed here could inform the design of targeted medical test cases that intentionally trigger specific failure modes, serving both as a validation tool for the taxonomy and as a stress-testing framework for evaluating model robustness across known error categories. For clinical translation, our work underscores that accuracy is an insufficient metric for safety and reliability. 
The ability to audit, understand, and even steer a model&#x2019;s reasoning process, as demonstrated here, represents a critical step toward building the &#x201C;glass-box&#x201D; systems necessary to earn clinician trust and ensure patient safety [<xref ref-type="bibr" rid="ref29">29</xref>].</p></sec><sec id="s4-7"><title>Conclusion</title><p>This study identified that 41% of initial model errors on MedQA reflected benchmark integrity issues, including missing figures and subsequently corrected ambiguities, rather than true model failures. Among the 37 confirmed reasoning errors, inductive analysis yielded a 4-category taxonomy (Information Synthesis, Therapeutic Decision, Diagnostic Reasoning, and Foundational Principle Errors) that revealed distinct failure profiles across 4 frontier LLMs. Steering reasoning-specific SAE features significantly improved accuracy on MedQA and PubMedQA, with a consistent positive trend on MedMCQA, while also increasing reasoning trace length, with no significant correlation between verbosity and performance. These findings demonstrate that medical LLM evaluation is constrained by flawed benchmarks and that reasoning failures follow identifiable, model-specific patterns with the potential for mechanistic correction via feature steering.</p></sec></sec></body><back><ack><p>The authors used generative artificial intelligence solely to identify key citations in the literature review. These systems were not involved in data collection, data analysis, study design, or paper drafting. The authors have reviewed the paper and accept full responsibility for it.</p></ack><notes><sec><title>Funding</title><p>This work was supported by the National Institutes of Health (NIH) grant R00LM014097-02. 
The NIH had no role in the design and conduct of the study; the collection, management, analysis, and interpretation of the data; the preparation, review, or approval of the paper; and the decision to submit the paper for publication.</p></sec><sec><title>Data Availability</title><p>The prompts are listed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. MedQA questions with missing figures and ambiguous MedQA questions updated in source question banks are reported in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendices 2</xref> and <xref ref-type="supplementary-material" rid="app3">3</xref>. The datasets used in this study are publicly available: MedQA (GitHub), MedMCQA (GitHub), and PubMedQA (GitHub). The training data for the sparse autoencoder are publicly available: LMSYS-Chat-1M (Hugging Face) and OpenThoughts-114k (Hugging Face). The code for the official ReasonScore implementation is available on GitHub.</p></sec></notes><fn-group><fn fn-type="con"><p>SL contributed to conceptualization, data curation, formal analysis, investigation, methodology, software development, and writing of the original draft. JL contributed to conceptualization, data curation, investigation, methodology, and writing of the original draft. 
AW contributed to the conceptualization, review, and editing of the paper.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">IgA</term><def><p>immunoglobulin A</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">QA</term><def><p>question-answering</p></def></def-item><def-item><term id="abb6">SAE</term><def><p>sparse autoencoder</p></def></def-item><def-item><term id="abb7">USMLE</term><def><p>United States Medical Licensing Examination</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Ouyang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Training language models to follow instructions with human feedback</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 4, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2203.02155</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Patterson</surname><given-names>BL</given-names> </name><etal/></person-group><article-title>Using AI-generated suggestions from ChatGPT to optimize clinical 
decision support</article-title><source>J Am Med Inform Assoc</source><year>2023</year><month>06</month><day>20</day><volume>30</volume><issue>7</issue><fpage>1237</fpage><lpage>1245</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad072</pub-id><pub-id pub-id-type="medline">37087108</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tai-Seale</surname><given-names>M</given-names> </name><name name-style="western"><surname>Baxter</surname><given-names>SL</given-names> </name><name name-style="western"><surname>Vaida</surname><given-names>F</given-names> </name><etal/></person-group><article-title>AI-generated draft replies integrated into health records and physicians&#x2019; electronic communication</article-title><source>JAMA Netw Open</source><year>2024</year><month>04</month><day>1</day><volume>7</volume><issue>4</issue><fpage>e246565</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.6565</pub-id><pub-id pub-id-type="medline">38619840</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>McCoy</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>AP</given-names> </name><etal/></person-group><article-title>Leveraging large language models for generating responses to patient messages&#x2014;a subjective analysis</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>05</month><day>20</day><volume>31</volume><issue>6</issue><fpage>1367</fpage><lpage>1379</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae052</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Van Veen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Van Uden</surname><given-names>C</given-names> </name><name name-style="western"><surname>Blankemeier</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Clinical text summarization: adapting large language models can outperform human experts</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 14, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.07430</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>McCoy</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>AP</given-names> </name><etal/></person-group><article-title>Why do users override alerts? 
Utilizing large language model to summarize comments and optimize clinical decision support</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>05</month><day>20</day><volume>31</volume><issue>6</issue><fpage>1388</fpage><lpage>1396</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae041</pub-id><pub-id pub-id-type="medline">38452289</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>DeepSeek-AI</collab><name name-style="western"><surname>Guo</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><etal/></person-group><article-title>DeepSeek-R1: incentivizing reasoning capability in LLMs via reinforcement learning</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 22, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2501.12948</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>H</given-names> </name><etal/></person-group><article-title>A preliminary study of o1 in medicine: are we closer to an AI doctor?</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 23, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2409.15277</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Orr-Ewing</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Testing and evaluation of health care applications of large language models: a systematic review</article-title><source>JAMA</source><year>2025</year><month>01</month><day>28</day><volume>333</volume><issue>4</issue><fpage>319</fpage><lpage>328</lpage><pub-id pub-id-type="doi">10.1001/jama.2024.21700</pub-id><pub-id pub-id-type="medline">39405325</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>McCoy</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>A</given-names> </name></person-group><article-title>Improving large language model applications in biomedicine with retrieval-augmented generation: a systematic review, meta-analysis, and clinical development guidelines</article-title><source>J Am Med Inform Assoc</source><year>2025</year><month>04</month><day>1</day><volume>32</volume><issue>4</issue><fpage>605</fpage><lpage>615</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaf008</pub-id><pub-id pub-id-type="medline">39812777</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>E</given-names> </name><name name-style="western"><surname>Oufattole</surname><given-names>N</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Szolovits</surname><given-names>P</given-names> 
</name></person-group><article-title>What disease does this patient have? A large-scale open domain question answering dataset from medical exams</article-title><source>Appl Sci (Basel)</source><year>2021</year><volume>11</volume><issue>14</issue><fpage>6421</fpage><pub-id pub-id-type="doi">10.3390/app11146421</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Raji</surname><given-names>ID</given-names> </name><name name-style="western"><surname>Daneshjou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Alsentzer</surname><given-names>E</given-names> </name></person-group><article-title>It&#x2019;s time to bench the medical exam benchmark</article-title><source>NEJM AI</source><year>2025</year><month>01</month><day>23</day><volume>2</volume><issue>2</issue><pub-id pub-id-type="doi">10.1056/AIe2401235</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name></person-group><article-title>Utility of ChatGPT in clinical practice</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>28</day><volume>25</volume><fpage>e48568</fpage><pub-id pub-id-type="doi">10.2196/48568</pub-id><pub-id pub-id-type="medline">37379067</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Chung</surname><given-names>P</given-names> </name><name name-style="western"><surname>Koyejo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>N</given-names> </name></person-group><article-title>Fidelity of medical reasoning in large language models</article-title><source>JAMA Netw Open</source><year>2025</year><month>08</month><day>1</day><volume>8</volume><issue>8</issue><fpage>e2526021</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2025.26021</pub-id><pub-id pub-id-type="medline">40779272</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>B</given-names> </name></person-group><article-title>OVM, outcome-supervised value models for planning in mathematical reasoning</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 16, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2311.09724</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lightman</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kosaraju</surname><given-names>V</given-names> </name><name name-style="western"><surname>Burda</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Let&#x2019;s verify step by step</article-title><source>arXiv</source><comment>Preprint posted online on  May 31, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2305.20050</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group 
person-group-type="author"><name name-style="western"><surname>Cunningham</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ewart</surname><given-names>A</given-names> </name><name name-style="western"><surname>Riggs</surname><given-names>L</given-names> </name><name name-style="western"><surname>Huben</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sharkey</surname><given-names>L</given-names> </name></person-group><article-title>Sparse autoencoders find highly interpretable features in language models</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 15, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.08600</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Galichin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dontsov</surname><given-names>A</given-names> </name><name name-style="western"><surname>Druzhinina</surname><given-names>P</given-names> </name><etal/></person-group><article-title>I have covered all the bases here: interpreting reasoning features in large language models via sparse autoencoders</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 24, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2503.18878</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dejong</surname><given-names>G</given-names> </name><name name-style="western"><surname>Horn</surname><given-names>SD</given-names> </name><name name-style="western"><surname>Gassaway</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Slavin</surname><given-names>MD</given-names> </name><name 
name-style="western"><surname>Dijkers</surname><given-names>MP</given-names> </name></person-group><article-title>Toward a taxonomy of rehabilitation interventions: using an inductive approach to examine the &#x201C;black box&#x201D; of rehabilitation</article-title><source>Arch Phys Med Rehabil</source><year>2004</year><month>04</month><volume>85</volume><issue>4</issue><fpage>678</fpage><lpage>686</lpage><pub-id pub-id-type="doi">10.1016/j.apmr.2003.06.033</pub-id><pub-id pub-id-type="medline">15083447</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zheng</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chiang</surname><given-names>WL</given-names> </name><name name-style="western"><surname>Sheng</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>LMSYS-chat-1M: a large-scale real-world LLM conversation dataset</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 21, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.11998</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Guha</surname><given-names>E</given-names> </name><name name-style="western"><surname>Marten</surname><given-names>R</given-names> </name><name name-style="western"><surname>Keh</surname><given-names>S</given-names> </name><etal/></person-group><article-title>OpenThoughts: data recipes for reasoning models</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 4, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2506.04178</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation 
citation-type="web"><article-title>SAEDashboard</article-title><source>GitHub</source><access-date>2025-07-27</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/jbloomAus/SAEDashboard">https://github.com/jbloomAus/SAEDashboard</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Stolfo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Balachandran</surname><given-names>V</given-names> </name><name name-style="western"><surname>Yousefi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Horvitz</surname><given-names>E</given-names> </name><name name-style="western"><surname>Nushi</surname><given-names>B</given-names> </name></person-group><article-title>Improving instruction-following in language models through activation steering</article-title><source>arXiv</source><comment>Preprint posted online on  Oct 15, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.12877</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Turner</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Thiergart</surname><given-names>L</given-names> </name><name name-style="western"><surname>Leech</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Steering language models with activation engineering</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 20, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2308.10248</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><article-title>SAELens</article-title><source>GitHub</source><access-date>2026-06-03</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://github.com/jbloomAus/SAELens">https://github.com/jbloomAus/SAELens</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vernooij</surname><given-names>RWM</given-names> </name><name name-style="western"><surname>Sanabria</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Sol&#x00E0;</surname><given-names>I</given-names> </name><name name-style="western"><surname>Alonso-Coello</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mart&#x00ED;nez Garc&#x00ED;a</surname><given-names>L</given-names> </name></person-group><article-title>Guidance for updating clinical practice guidelines: a systematic review of methodological handbooks</article-title><source>Implement Sci</source><year>2014</year><month>01</month><day>2</day><volume>9</volume><issue>1</issue><fpage>3</fpage><pub-id pub-id-type="doi">10.1186/1748-5908-9-3</pub-id><pub-id pub-id-type="medline">24383701</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Haskins</surname><given-names>R</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>B</given-names> </name></person-group><article-title>Distilled circuits: a mechanistic study of internal restructuring in knowledge distillation</article-title><source>arXiv</source><comment>Preprint posted online on  May 16, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2505.10822</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Baek</surname><given-names>DD</given-names> </name><name name-style="western"><surname>Tegmark</surname><given-names>M</given-names> 
</name></person-group><article-title>Towards understanding distilled reasoning models: a representational approach</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 5, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2503.03730</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>McCoy</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Peterson</surname><given-names>JF</given-names> </name><etal/></person-group><article-title>Leveraging explainable artificial intelligence to optimize clinical decision support</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>04</month><day>3</day><volume>31</volume><issue>4</issue><fpage>968</fpage><lpage>974</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae019</pub-id><pub-id pub-id-type="medline">38383050</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Large language model prompt for answer and reasoning generation.</p><media xlink:href="jmir_v28i1e90061_app1.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>MedQA questions with missing figures.</p><media xlink:href="jmir_v28i1e90061_app2.docx" xlink:title="DOCX File, 87 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Ambiguous MedQA questions updated in source banks.</p><media xlink:href="jmir_v28i1e90061_app3.docx" xlink:title="DOCX File, 35 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Example of a complex question failed by all large language models (OpenAI o1, OpenAI o3-mini, OpenAI GPT-4.5, and 
DeepSeek-R1).</p><media xlink:href="jmir_v28i1e90061_app4.docx" xlink:title="DOCX File, 28 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Feature overlap heatmaps across medical datasets. For each dataset and steering strength, features were ranked by their poststeering accuracy (exact match) on that dataset. The heatmaps show the set intersection count of top-K feature lists between each dataset pair, that is, |Top-K(A) &#x2229; Top-K(B)|, for K=5, 10, 15, and 20 at steering strengths of 2 and 4. Diagonal entries equal K.</p><media xlink:href="jmir_v28i1e90061_app5.docx" xlink:title="DOCX File, 237 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Functional analysis of top 15 reasoning-specific sparse autoencoder features.</p><media xlink:href="jmir_v28i1e90061_app6.docx" xlink:title="DOCX File, 19 KB"/></supplementary-material></app-group></back></article>