<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e74142</article-id><article-id pub-id-type="doi">10.2196/74142</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Evaluating the Reasoning Capabilities of Large Language Models for Medical Coding and Hospital Readmission Risk Stratification: Zero-Shot Prompting Approach</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Naliyatthaliyazchayil</surname><given-names>Parvati</given-names></name><degrees>PharmD, MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Muthyala</surname><given-names>Raajitha</given-names></name><degrees>BPharm</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gichoya</surname><given-names>Judy Wawira</given-names></name><degrees>MS, MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name 
name-style="western"><surname>Purkayastha</surname><given-names>Saptarshi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Biomedical Engineering and Informatics, Luddy School of Informatics, Computing and Engineering, Indiana University Indianapolis</institution><addr-line>535 W Michigan Street</addr-line><addr-line>Indianapolis</addr-line><addr-line>IN</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Radiology and Imaging Sciences, Emory University School of Medicine, Emory University</institution><addr-line>Atlanta</addr-line><addr-line>GA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Sena</surname><given-names>Jessica</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Milic</surname><given-names>Marko Kimi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Saptarshi Purkayastha, PhD, Department of Biomedical Engineering and Informatics, Luddy School of Informatics, Computing and Engineering, Indiana University Indianapolis, 535 W Michigan Street, Indianapolis, IN, 46202, United States, 1 317 274 0439; <email>saptpurk@iu.edu</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>30</day><month>7</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e74142</elocation-id><history><date date-type="received"><day>21</day><month>03</month><year>2025</year></date><date date-type="rev-recd"><day>08</day><month>06</month><year>2025</year></date><date 
date-type="accepted"><day>10</day><month>06</month><year>2025</year></date></history><copyright-statement>&#x00A9; Parvati Naliyatthaliyazchayil, Raajitha Muthyala, Judy Wawira Gichoya, Saptarshi Purkayastha. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 30.7.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e74142"/><abstract><sec><title>Background</title><p>Large language models (LLMs) such as ChatGPT-4, LLaMA-3.1, Gemini-1.5, DeepSeek-R1, and OpenAI-O3 have shown promising potential in health care, particularly for clinical reasoning and decision support. 
However, their reliability across critical tasks like diagnosis, medical coding, and risk prediction has received mixed reviews, especially in real-world settings without task-specific training.</p></sec><sec><title>Objective</title><p>This study aims to evaluate and compare the zero-shot performance of reasoning and nonreasoning LLMs in three essential clinical tasks: (1) primary diagnosis generation, (2) <italic>ICD-9</italic> (<italic>International Classification of Diseases, Ninth Revision</italic>) medical code prediction, and (3) hospital readmission risk stratification. The goal is to assess whether these models can serve as general-purpose clinical decision support tools and to identify gaps in current capabilities.</p></sec><sec sec-type="methods"><title>Methods</title><p>Using the Medical Information Mart for Intensive Care-IV dataset, we selected a random cohort of 300 hospital discharge summaries. Prompts were engineered to include structured clinical content from 5 note sections: chief complaints, past medical history, surgical history, laboratories, and imaging. Prompts were standardized and zero-shot, with no model fine-tuning or repetition across runs. All model interactions were conducted through publicly available web user interfaces, without using application programming interfaces, to simulate real-world accessibility for nontechnical users. We incorporated rationale elicitation into prompts to evaluate model transparency, especially in reasoning models. Ground-truth labels were derived from the primary diagnosis documented in clinical notes, structured <italic>ICD-9</italic> codes from diagnosis, and hospital-recorded readmission frequencies for risk stratification. 
Performance was measured using <italic>F</italic><sub>1</sub>-scores and correctness percentages, and comparative performance was analyzed statistically.</p></sec><sec sec-type="results"><title>Results</title><p>Among nonreasoning models, LLaMA-3.1 achieved the highest primary diagnosis accuracy (n=255, 85%), followed by ChatGPT-4 (n=254, 84.7%) and Gemini-1.5 (n=237, 79%). For <italic>ICD-9</italic> prediction, correctness dropped significantly across all models: LLaMA-3.1 (n=128, 42.6%), ChatGPT-4 (n=122, 40.6%), and Gemini-1.5 (n=44, 14.6%). Hospital readmission risk prediction showed low performance in nonreasoning models: LLaMA-3.1 (n=124, 41.3%), Gemini-1.5 (n=122, 40.7%), and ChatGPT-4 (n=99, 33%). Among reasoning models, OpenAI-O3 outperformed in diagnosis (n=270, 90%) and <italic>ICD-9</italic> coding (n=136, 45.3%), while DeepSeek-R1 performed slightly better in the readmission risk prediction (n=218, 72.6% vs O3&#x2019;s n=212, 70.6%). Despite improved explainability, reasoning models generated verbose responses. None of the models met clinical standards across all tasks, and performance in medical coding remained the weakest area across all models.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Current LLMs exhibit moderate success in zero-shot diagnosis and risk prediction but underperform in <italic>ICD-9</italic> code generation, reinforcing findings from prior studies. Reasoning models offer marginally better performance and increased interpretability, with limited reliability. Overall, statistical analysis between the models revealed that OpenAI-O3 outperformed the other models. These results highlight the need for task-specific fine-tuning and human-in-the-loop checking. 
Future work will explore fine-tuning, stability through repeated trials, and evaluation on a different subset of deidentified real-world data with a larger sample size.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>clinical decision support</kwd><kwd>zero-shot learning</kwd><kwd>medical coding</kwd><kwd>primary diagnosis prediction</kwd><kwd>readmission risk prediction</kwd><kwd>explainability</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The rapid evolution of large language models (LLMs), which are artificial intelligence (AI) systems designed to understand and generate human-like text, has sparked widespread interest in their potential applications across various domains [<xref ref-type="bibr" rid="ref1">1</xref>], particularly health care [<xref ref-type="bibr" rid="ref2">2</xref>]. Alongside established nonreasoning models like ChatGPT-4, LLaMA-3.1, and Gemini-1.5, new reasoning models, such as DeepSeek-R1 and OpenAI-O3, have also emerged during this study, with reasoning capabilities embedded in their design, enabling more logical, step-by-step decision-making. These models enable users to perform complex language-based tasks without domain-specific training, using only natural language input [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>While some initial studies highlight the promising ability of these LLMs to handle complex health care tasks [<xref ref-type="bibr" rid="ref3">3</xref>], others raise critical concerns about their accuracy, reliability, and adherence to the high standards required in clinical settings [<xref ref-type="bibr" rid="ref4">4</xref>]. This duality highlights the need for careful evaluation of their utility and reliability in real-world clinical environments [<xref ref-type="bibr" rid="ref5">5</xref>]. 
This leads us to key questions in this rapidly advancing field: which of these preconfigured LLMs is most suitable for addressing the unique challenges of health care tasks? Do newer reasoning models outperform their nonreasoning counterparts?</p><p>To address this question, our study systematically compares the performance of 5 models, prominent nonreasoning LLMs ChatGPT-4, LLaMA-3.1, and Gemini-1.5 as well as reasoning models DeepSeek-R1 and OpenAI-O3 across key health care tasks. The nonreasoning models were selected based on their widespread popularity and adoption, while the reasoning models were chosen for their recently introduced, advanced reasoning capabilities at the time of study design. Specifically, we evaluated their aggregated ability to generate primary diagnoses, code it to the <italic>ICD-9</italic> (<italic>International Classification of Diseases, Ninth Revision</italic>) codes, and predict risk stratification for hospital readmission using zero-shot prompting. To increase interpretability, structured rationale elicitation was incorporated into the prompting for diagnostic and prognostic tasks, especially for nonreasoning models.</p><p>In our study context, primary diagnosis refers to the main condition that is chiefly responsible for a patient&#x2019;s current hospitalization. To ensure consistency across health care systems, diagnosis is coded to <italic>ICD-9</italic> or <italic>ICD-10</italic> (<italic>International Statistical Classification of Diseases, Tenth Revision</italic>) as a standard practice. <italic>ICD-9</italic> and <italic>ICD-10</italic> are standardized coding systems used globally for categorizing diseases, conditions, and medical procedures [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. Each diagnosis is assigned a unique numeric or alphanumeric code that codes diagnoses for medical records. 
Further, we define hospital readmission as the likelihood of a patient being readmitted to the hospital after discharge within the full time frame covered by the dataset.</p><p>This analysis was conducted using the Medical Information Mart for Intensive Care (MIMIC)-IV dataset [<xref ref-type="bibr" rid="ref8">8</xref>], a controlled-access, real-world clinical dataset derived from critical care hospital admissions. Since these LLMs are primarily trained on publicly available internet data [<xref ref-type="bibr" rid="ref9">9</xref>], using a controlled-access, real-world, deidentified clinical dataset is better suited for evaluating their clinical performance. This dataset includes detailed patient, admission, diagnosis, and discharge information. From the discharge summaries, also called clinical notes, we extracted sections like chief complaints, past medical history, surgical history, laboratories, and imaging to construct prompts for model evaluation. The primary diagnosis section was excluded from prompts and instead used as ground truth for evaluating diagnostic predictions. Structured <italic>ICD-9</italic> codes served as the reference for code prediction accuracy, while hospital-recorded readmission counts (via hadm_id) were used to assess readmission risk prediction. Zero-shot prompting was used to evaluate model generalizability without task-specific fine-tuning [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>The objective of this study is to evaluate which preconfigured LLMs are most suitable for addressing the unique challenges of health care tasks and whether newer reasoning models outperform their nonreasoning counterparts in predicting primary diagnoses, medical codes, and readmission risk. Additionally, the study aims to assess the potential role of preconfigured LLMs in supporting clinical decision-making without the need for task-specific fine-tuning. 
By leveraging real-world health care data from the MIMIC-IV dataset and using zero-shot prompting, we evaluate the models&#x2019; accuracy and effectiveness in a clinical context. Our analysis seeks to paint a clearer picture of the feasibility and limitations of these models for safer and effective health care applications.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This study involved secondary analysis of deidentified patient data from MIMIC-IV (version 2.2) and MIMIC-IV Note (version 2.2) databases. The Massachusetts Institute of Technology Institutional Review Board approved MIMIC-IV data use (protocol 0403000206). As the dataset is fully deidentified per Health Insurance Portability and Accountability Act requirements, this research was classified as nonhuman participant research, requiring no additional institutional review board approval.</p></sec><sec id="s2-2"><title>Study Design</title><p>This study used a multistep approach to evaluate the performance of LLMs in addressing key health care tasks, including the ability to predict primary diagnoses, assign <italic>ICD-9</italic> codes, and stratify hospital readmission risks, along with explanations for diagnosis and risk classification. The web-based user interfaces of these LLMs were used, as the study focuses on evaluating readily accessible, out-of-the-box chatbot versions rather than application programming interface (API)&#x2013;based implementations, which may require additional technical skills and incur extra costs. The methodology is organized into 3 key phases, summarized as follows.</p></sec><sec id="s2-3"><title>Sample Collection</title><p>Clinical data were obtained from the controlled-access MIMIC-IV dataset. 
It is a deidentified dataset containing detailed health information from patients admitted to the emergency department or intensive care units at Beth Israel Deaconess Medical Center in Boston, MA [<xref ref-type="bibr" rid="ref12">12</xref>]. A sample of 300 unique patient IDs was selected, ensuring that each patient had valid diagnosis codes and at least 1 available discharge summary. For each patient, <italic>ICD-9</italic> and <italic>ICD-10</italic> codes were extracted as a CSV list, along with their first discharge note. As <italic>ICD-9</italic> codes were more prevalent in the sample, all <italic>ICD-10</italic> codes were crosswalked to <italic>ICD-9</italic> to minimize data loss. Readmission risk was evaluated by calculating each patient&#x2019;s total number of admissions using hadm_id and admission dates. Of the 300 patients, 150 had multiple admissions, while the remaining 150 had a single admission, as shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. All the subject_ids used in this sample are listed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Sample collection of 300 unique subject_ids. This figure shows that the sample of 300 subject_ids was created from the MIMIC-IV dataset, and then, any <italic>ICD-10</italic> (<italic>International Statistical Classification of Diseases, Tenth Revision</italic>) codes in the sample were crosswalked to the respective <italic>ICD-9</italic> (<italic>International Classification of Diseases, Ninth Revision</italic>) using the UMLS crosswalk. The tables show the structure of the output for ease of understanding. 
MIMIC: Medical Information Mart for Intensive Care; UMLS: Unified Medical Language System.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e74142_fig01.png"/></fig></sec><sec id="s2-4"><title>Prompt Template and Creation</title><p>Prominent sections from labeled discharge summaries in the MIMIC-IV Note database were used to draft prompts [<xref ref-type="bibr" rid="ref13">13</xref>]. For each patient, the following sections were extracted: chief complaints, past medical history, surgical history, laboratories, and imaging, and programmatically formatted into a structured prompt template for LLM evaluation as shown in <xref ref-type="fig" rid="figure2">Figures 2</xref> and <xref ref-type="fig" rid="figure3">3</xref>. The primary diagnosis was also extracted but not included in the prompt; instead, it served as ground truth for evaluating model performance.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Creation of a prompt using sections from discharge summaries or clinical notes. This figure shows how the output from <xref ref-type="fig" rid="figure1">Figure 1</xref> is further used. The key sections from MIMIC-IV clinical notes were used in prompt creation and extracting the primary diagnosis of the sample. <italic>ICD-9: International Classification of Diseases, Ninth Revision</italic>; MIMIC: Medical Information Mart for Intensive Care.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e74142_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Prompt template. This figure shows the prompt template that is systematically populated for each subject_id from their notes that were extracted. On the right, you see an example representation of MIMIC-IV Note, which is then populated into its respective sections within the prompt. 
The note here is an example and not an actual record from MIMIC-IV. LLM: large language model; MIMIC: Medical Information Mart for Intensive Care.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e74142_fig03.png"/></fig><p>To accommodate differences in model context windows, prompt length and content were optimized through preliminary testing. We ensured that essential clinical information was included while keeping the prompt within the context limit among the evaluated models. This balance was critical to maintain fairness across models and to avoid truncation of input. We also tested prompt clarity and effectiveness through pilot runs, refining phrasing and structure to maximize model understanding. Data used in these pilots were excluded from the main research sample. Example prompts are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-5"><title>Collecting and Processing the Response</title><p>All prompts were systematically generated and input into each AI chatbot through their respective web user interfaces. Each prompt was given to each chatbot only once, without repetition, and the memory of the chatbot was disabled to prevent them from learning from each prompt. The generated responses were stored in a CSV file alongside patient metadata. Structured outputs were parsed into individual columns, capturing the primary diagnosis generated by the LLM, a list of <italic>ICD-9</italic> codes associated with the primary diagnosis, the predicted readmission risk status, explanations for the selected primary diagnosis, and justifications for the predicted readmission risk status.</p><p>The final dataset was then prepared for evaluation against the ground truth. 
Detailed prompt structure and response parsing procedures are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>This study provides a comparative evaluation of leading LLMs, ChatGPT-4, LLaMA-3.1, Gemini-1.5, DeepSeek-R1, and OpenAI-O3 in terms of their ability to perform health care&#x2013;specific tasks. The prompt was created from the key sections of MIMIC-IV clinical notes. The responses produced by LLM were extracted into their individual structured columns for analysis and compared against the ground truth from MIMIC-IV data. The results highlight notable and interesting variations in performance across tasks.</p></sec><sec id="s3-2"><title>Comparing the Prediction of Primary Diagnosis</title><p>The primary diagnosis from each LLM&#x2019;s response was compared against the primary diagnosis extracted from MIMIC-IV clinical notes. We used SciBERT, a pretrained model specifically designed for scientific and medical contexts [<xref ref-type="bibr" rid="ref14">14</xref>]. This makes it particularly adept at processing and understanding domain-specific language, which is essential for comparing medical terminologies.</p><p>The allenai/scibert_scivocab_uncased variant of SciBERT [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>], implemented through the SentenceTransformer framework, was used to generate embeddings for both the ground truth primary diagnosis (from MIMIC-IV clinical notes) and the LLM-predicted diagnosis. The process involved:</p><list list-type="order"><list-item><p>Embedding generation: Both the reference diagnosis and the LLM-generated text were converted into high-dimensional embeddings using SciBERT.</p></list-item><list-item><p>Cosine similarity computation: Cosine similarity was calculated between the 2 embeddings to quantify their semantic similarity. 
A cosine similarity threshold of 0.7 was chosen to reflect a moderate to high level of semantic similarity, ensuring that predictions captured the intended clinical meaning without requiring exact wording. This threshold provided a practical balance between sensitivity and specificity for our evaluation needs. A threshold of 0.7 classifies predictions: scores &#x2265;0.7 were considered semantically aligned with the ground truth and scores &#x003C;0.7 were categorized as incorrect or divergent predictions.</p></list-item></list><p>Among nonreasoning models, LLaMA-3.1 and ChatGPT-4 exhibited comparable performance, with semantic match rates of 85% (255/300) and 84.7% (254/300), respectively. This marginal difference suggests that both models are similarly capable of aligning with the ground truth diagnoses, outperforming Gemini-1.5, which achieved a match rate of 79% (237/300). Between the reasoning models, OpenAI-O3 exhibited higher performance with a 90% (270/300) match rate, whereas DeepSeek-R1 showed an 85% (255/300) match rate. Reasoning models performed better than the nonreasoning models.</p></sec><sec id="s3-3"><title>Comparing the Prediction of <italic>ICD-9</italic> Code</title><p>To evaluate the accuracy of <italic>ICD-9</italic> code predictions by the LLMs, we performed a systematic comparison against the ground truth codes from the MIMIC-IV dataset, which includes both <italic>ICD-9</italic> codes and <italic>ICD-10</italic> codes. We crosswalked the <italic>ICD-10</italic> codes to <italic>ICD-9</italic> using the Unified Medical Language System [<xref ref-type="bibr" rid="ref16">16</xref>] <italic>ICD-10</italic> to <italic>ICD-9</italic> crosswalk [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. 
The decision to crosswalk was driven by the relatively small number of <italic>ICD-10</italic> codes present in our sample, ensuring that the majority of original diagnostic codes could be consistently represented for comparison.</p><p>Both the ground truth <italic>ICD-9</italic> codes and LLM-generated codes were converted into CSV lists to ensure uniformity. We then conducted a row-wise comparison to identify matches between the predicted and ground truth <italic>ICD-9</italic> codes.</p><p>In evaluating the ability of the nonreasoning LLMs to predict <italic>ICD-9</italic> codes for primary diagnoses, LLaMA-3.1 correctly predicted <italic>ICD-9</italic> codes for 128 of 300 patients. ChatGPT-4 followed, correctly predicting <italic>ICD-9</italic> codes for 122 of 300 patients. Gemini-1.5 lagged behind, predicting <italic>ICD-9</italic> codes for 44 of 300 patients. These results indicate that LLaMA-3.1 and ChatGPT-4 are comparably effective, but their performance still falls short of the accuracy required for reliable medical coding applications, and this finding aligns with studies in the literature [<xref ref-type="bibr" rid="ref4">4</xref>]. Between the reasoning models, OpenAI-O3 correctly predicted <italic>ICD-9</italic> codes for 136 of 300 patients, whereas DeepSeek-R1 correctly predicted <italic>ICD-9</italic> codes for 121 of 300 patients. The medical coding skills for the reasoning models also lagged far behind the standards expected for clinical practice. Further refinement and training may be needed to enhance the models&#x2019; effectiveness in this domain.</p></sec><sec id="s3-4"><title>Top 10 <italic>ICD-9</italic> Codes in MIMIC-IV Sample and 3 Nonreasoning LLMs</title><p>We evaluated the top 10 <italic>ICD-9</italic> codes from the MIMIC-IV sample and the 3 nonreasoning LLM-generated <italic>ICD-9</italic> codes, as shown in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. 
Each subject_id can have multiple <italic>ICD-9</italic> codes. For this analysis, we implemented an <italic>ICD-9</italic> hierarchical rollup by aggregating detailed diagnosis codes to their respective 3-digit parent categories. For example, specific codes like 414.0 (coronary atherosclerosis) and 414.00 (coronary atherosclerosis of unspecified type of vessel) were rolled up to their broader parent category, 414 (other forms of chronic ischemic heart disease). The top 10 <italic>ICD-9</italic> codes were calculated after this rollup.</p><p>We found that <italic>ICD-9</italic> codes associated with the parent category 414 (other forms of chronic ischemic heart disease) were present across all 3 LLMs and the MIMIC-IV sample as one of the top 2. In contrast, another parent category, 780 (general symptoms), appeared in all 3 LLMs but was absent in the MIMIC-IV sample. This suggests that the LLMs were coding many symptoms differently from clinical practice, highlighting an area for potential improvement. Additionally, the parent category for diabetes mellitus was observed in the MIMIC-IV sample, LLaMA-3.1, and ChatGPT-4, but not in Gemini-1.5, which aligns with our findings of <italic>ICD-9</italic> code predictions, where Gemini-1.5 underperformed.</p></sec><sec id="s3-5"><title>Comparing the Prediction of Hospital Readmission Risk Status</title><p>The ground truth for readmission risk from MIMIC-IV was derived as a numeric value representing the total number of readmissions per patient. In contrast, the LLM-generated responses were qualitative, assigning each patient a categorical label of low, medium, or high risk. To enable a meaningful comparison between these 2 formats, the numeric readmission counts were converted into qualitative categories. We applied a quantile-based thresholding approach. 
Specifically, the distribution of readmission counts across the dataset was used to define 3 categories:</p><list list-type="bullet"><list-item><p>Low risk: Readmission count&#x2264;25th percentile</p></list-item><list-item><p>Medium risk: Readmission count&#x003E;25th percentile and &#x2264;75th percentile</p></list-item><list-item><p>High risk: Readmission count&#x003E;75th percentile</p></list-item></list><p>This categorization ensured consistency between the qualitative model outputs and the quantitative ground truth, allowing for structured evaluation of LLM performance in readmission risk prediction.</p><p>Among nonreasoning models, LLaMA-3.1 had 41.3% (124/300) correct predictions, followed by Gemini-1.5 with 40.7% (122/300) and ChatGPT-4 with 33% (99/300). While LLaMA-3.1 and Gemini-1.5 demonstrated moderate alignment with the ground truth categories, the overall results suggest significant room for improvement. Among the reasoning models, DeepSeek-R1 performed slightly better with 72.6% (218/300) correct risk predictions than OpenAI-O3 with 70.6% (212/300) correct risk predictions. This shows that reasoning models perform better than nonreasoning models for readmission risk prediction.</p></sec><sec id="s3-6"><title><italic>F</italic><sub>1</sub>-Score for <italic>ICD-9</italic> Code Prediction and Readmission Risk Status</title><p>We calculated the multiclass multilabel <italic>F</italic><sub>1</sub>-score for <italic>ICD-9</italic> code prediction and the macroaveraged <italic>F</italic><sub>1</sub>-score for readmission risk stratification for all 5 LLMs. <italic>F</italic><sub>1</sub>-score for <italic>ICD-9</italic> code prediction helps to evaluate how well the model identifies correct codes while avoiding incorrect ones. For readmission risk prediction, <italic>F</italic><sub>1</sub>-scores identify how the LLM balances identifying patients at risk (eg, &#x201C;high risk&#x201D;) while avoiding unnecessary false alarms. 
As seen in <xref ref-type="table" rid="table1">Table 1</xref>, <italic>F</italic><sub>1</sub>-scores were generally low for both reasoning and nonreasoning models, primarily due to the higher number of false negatives. Among the 3 nonreasoning LLMs, LLaMA-3.1 achieved the highest <italic>F</italic><sub>1</sub>-scores for both <italic>ICD-9</italic> code prediction and readmission risk stratification. Within the reasoning models, OpenAI-O3 had the highest average <italic>F</italic><sub>1</sub>-score across both tasks. This finding highlights the fact that, despite some differences in performance, both reasoning and nonreasoning models exhibited notable levels of false negatives and false positives.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p><italic>F</italic><sub>1</sub>-scores for LLaMA-3.1, ChatGPT-4, and Gemini-1.5<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Chatbot</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score <italic>ICD-9</italic><sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> code prediction</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score readmission prediction</td></tr></thead><tbody><tr><td align="left" valign="top">LLaMA-3.1</td><td align="left" valign="top">0.083</td><td align="left" valign="top">0.412</td></tr><tr><td align="left" valign="top">ChatGPT-4</td><td align="left" valign="top">0.081</td><td align="left" valign="top">0.322</td></tr><tr><td align="left" valign="top">Gemini-1.5</td><td align="left" valign="top">0.024</td><td align="left" valign="top">0.408</td></tr><tr><td align="left" valign="top">DeepSeek-R1</td><td align="left" valign="top">0.091</td><td align="left" valign="top">0.422</td></tr><tr><td align="left" valign="top">OpenAI-O3</td><td align="left" valign="top">0.122</td><td align="left" 
valign="top">0.414</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>This table shows the multiclass multilabel <italic>F</italic><sub>1</sub>-score for <italic>ICD-9</italic> (<italic>International Classification of Diseases, Ninth Revision</italic>) prediction and <italic>F</italic><sub>1</sub>-score for hospital readmission risk prediction. <italic>F</italic><sub>1</sub>-scores take into consideration true positives, true negatives, false positives, and false negatives. The <italic>F</italic><sub>1</sub>-scores for <italic>ICD-9</italic> code prediction are low for all large language models due to the increased false negatives compared to the true positives.</p></fn><fn id="table1fn2"><p><sup>b</sup><italic>ICD-9</italic>: <italic>International Classification of Diseases, Ninth Revision</italic>.</p></fn></table-wrap-foot></table-wrap><p>To evaluate whether the performance differences across models were statistically significant, we initially performed pairwise Wilcoxon signed rank tests on per-task accuracy scores (n=3). After applying Bonferroni correction for multiple comparisons, no pairwise differences reached statistical significance (all corrected <italic>P</italic> values&#x003E;.05), as shown in <xref ref-type="table" rid="table2">Table 2</xref>. This lack of significance is likely due to the small number of tasks and limited statistical power. To further assess the robustness of our findings, we also conducted Mann-Whitney <italic>U</italic> tests for independent sample comparisons across all models. 
The results consistently showed no significant differences between model performances, with <italic>P</italic> values greater than .05 for all pairwise comparisons.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Pairwise Wilcoxon signed rank test results comparing 5 large language models across 3 tasks<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model_1</td><td align="left" valign="bottom">Model_2</td><td align="left" valign="bottom">Wilcoxon_stat</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom"><italic>P</italic> value_bonferroni</td><td align="left" valign="bottom">Significant</td></tr></thead><tbody><tr><td align="left" valign="top">LLaMA-3.1</td><td align="left" valign="top">ChatGPT-4</td><td align="left" valign="top">0.0</td><td align="left" valign="top">.25</td><td align="left" valign="top">2.500000</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">LLaMA-3.1</td><td align="left" valign="top">Gemini-1.5</td><td align="left" valign="top">0.0</td><td align="left" valign="top">.25</td><td align="left" valign="top">2.500000</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">LLaMA-3.1</td><td align="left" valign="top">OpenAI-O3</td><td align="left" valign="top">0.0</td><td align="left" valign="top">.25</td><td align="left" valign="top">2.500000</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">ChatGPT-4</td><td align="left" valign="top">Gemini-1.5</td><td align="left" valign="top">0.0</td><td align="left" valign="top">.25</td><td align="left" valign="top">2.500000</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">ChatGPT-4</td><td align="left" valign="top">OpenAI-O3</td><td align="left" valign="top">0.0</td><td align="left" 
valign="top">.25</td><td align="left" valign="top">2.500000</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">Gemini-1.5</td><td align="left" valign="top">OpenAI-O3</td><td align="left" valign="top">0.0</td><td align="left" valign="top">.25</td><td align="left" valign="top">2.500000</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">Gemini-1.5</td><td align="left" valign="top">DeepSeek-R1</td><td align="left" valign="top">0.0</td><td align="left" valign="top">.25</td><td align="left" valign="top">2.500000</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">OpenAI-O3</td><td align="left" valign="top">DeepSeek-R1</td><td align="left" valign="top">1.0</td><td align="left" valign="top">.50</td><td align="left" valign="top">5.000000</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">LLaMA-3.1</td><td align="left" valign="top">DeepSeek-R1</td><td align="left" valign="top">1.0</td><td align="left" valign="top">.65</td><td align="left" valign="top">6.547208</td><td align="left" valign="top">False</td></tr><tr><td align="left" valign="top">ChatGPT-4</td><td align="left" valign="top">DeepSeek-R1</td><td align="left" valign="top">2.0</td><td align="left" valign="top">.75</td><td align="left" valign="top">6.547208</td><td align="left" valign="top">False</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>This table shows pairwise Wilcoxon signed rank test <italic>P</italic> values and their significance across 3 major tasks.</p></fn></table-wrap-foot></table-wrap><p>In addition, to provide a more descriptive analysis of model variability, we computed 95% bootstrap CIs for each model&#x2019;s mean accuracy. As shown in <xref ref-type="table" rid="table3">Table 3</xref>, the model OpenAI-O3 achieved the highest average accuracy (69.33%, 95% CI 45.33-90.0), followed by DeepSeek-R1 (65.33%, 95% CI 40.33-85.0). 
Although LLaMA-3.1 and ChatGPT-4 had lower means (~56%), their CIs overlapped substantially with those of the higher-performing models. Gemini-1.5 demonstrated the lowest performance (42.22%, 95% CI 14.67-79.0), with a wide CI indicating high variability. Together, these analyses suggest that while OpenAI-O3 and DeepSeek-R1 appear to perform better, the limited number of tasks restricts the ability to draw firm conclusions regarding statistical significance. Future studies with a larger and more diverse task set will help validate these trends with greater statistical certainty.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Bootstrap CIs for each model&#x2019;s mean accuracy<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Mean accuracy (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">OpenAI-O3</td><td align="left" valign="top">68.68 (45.33-90.0)</td></tr><tr><td align="left" valign="top">DeepSeek-R1</td><td align="left" valign="top">66.00 (40.33-85.0)</td></tr><tr><td align="left" valign="top">LLaMA-3.1</td><td align="left" valign="top">56.33 (41.33-85.0)</td></tr><tr><td align="left" valign="top">ChatGPT-4</td><td align="left" valign="top">55.41 (40.67-84.9)</td></tr><tr><td align="left" valign="top">Gemini-1.5</td><td align="left" valign="top">42.22 (14.67-79.0)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>This table shows the bootstrap CI for each model&#x2019;s mean accuracy with OpenAI-O3 showing the top performance when comparing each model toward aggregated tasks.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Overview</title><p>Our results show that reasoning models outperformed nonreasoning ones across most tasks. 
OpenAI-O3 showed the highest accuracy for primary diagnosis (n=270, 90%) and <italic>ICD-9</italic> coding (n=136, 45.3%), while DeepSeek-R1 led in readmission prediction (n=218, 72.6%). LLaMA-3.1 was the strongest nonreasoning model but showed lower performance on <italic>ICD-9</italic> and readmission tasks. Although statistical significance was not reached, consistent performance trends of reasoning models suggest practical relevance particularly in clinical settings where even small gains can impact outcomes. Reasoning models also provided more detailed explanations, though their verbosity may hinder usability. No model met clinical standards across all tasks. Future work with more tasks and effect size analyses can better validate these patterns.</p></sec><sec id="s4-2"><title>Comparison to Prior Work</title><p>The existing literature presents mixed findings on the capabilities of LLMs in health care tasks such as diagnosis prediction and medical coding. Soroush et al [<xref ref-type="bibr" rid="ref4">4</xref>] report poor performance in medical coding, while Kwan [<xref ref-type="bibr" rid="ref3">3</xref>] showed improved outcomes with augmentation strategies. Lee et al [<xref ref-type="bibr" rid="ref9">9</xref>] emphasize that while LLMs make errors, they also demonstrate potential in identifying them. Zhu et al [<xref ref-type="bibr" rid="ref10">10</xref>] illustrate that incorporating longitudinal health records into prompts enhances predictive accuracy. Zhou et al [<xref ref-type="bibr" rid="ref17">17</xref>] further highlight the value of prompt engineering and fine-tuning with high-quality data for robust diagnostic performance. 
Nuthakki et al [<xref ref-type="bibr" rid="ref18">18</xref>] demonstrated that domain-specific deep learning models like Universal Language Model Fine-Tuning, when trained on large-scale datasets such as MIMIC-III, can perform well in ICD code prediction tasks, underscoring the contrast between tailored models and general-purpose LLMs evaluated in our study. Recent studies using MIMIC data also reveal some challenges: one found that converting structured data to free text for mortality prediction with zero-shot prompting showed limited accuracy [<xref ref-type="bibr" rid="ref19">19</xref>], while another showed that minor changes like word swaps or misspellings can significantly affect ICD code predictions [<xref ref-type="bibr" rid="ref20">20</xref>]. With these diverse findings in mind, we sought to evaluate the performance of 5 prominent, out-of-the-box LLMs for aggregated high-value health care tasks using a dataset that these LLMs are not already trained on.</p><p>Building on prior work, we used a sample size of 300 deidentified patients from MIMIC-IV [<xref ref-type="bibr" rid="ref21">21</xref>], a larger sample than many previous studies [<xref ref-type="bibr" rid="ref22">22</xref>]. By leveraging sections of patient discharge summaries and focusing on tasks like predicting primary diagnoses, generating <italic>ICD-9</italic> codes, and stratifying hospital readmission risk, we provide new insights into the potential of LLMs to handle aggregated complex clinical tasks using a chatbot interface, without task-specific fine-tuning. Our use of zero-shot prompting, which avoids the need for additional setup or fine-tuning, highlights the practicality and efficiency of these models in real-world health care settings [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref24">24</xref>]. 
However, we acknowledge that using publicly available chatbot interfaces rather than controlled APIs or locally hosted models creates challenges for reproducing results. This is because the models behind these tools like ChatGPT-4 are regularly updated and improved without fixed subversion numbers that users can select. Even when accessing the models through APIs, it is not possible to lock in a specific subversion [<xref ref-type="bibr" rid="ref25">25</xref>], so outputs can change over time. While this limits strict repeatability, it reflects how most real users interact with these models in practice. Our study prioritizes ecological validity over perfect experimental control. For future research, using open-source models like LLaMA-3.1 or DeepSeek-R1 in local environments could help stabilize versions and settings, making experiments easier to reproduce. Our study offers a baseline to understand their strengths, limitations, ethical considerations, and areas for improvement, ultimately guiding future research in fine-tuning and prompt engineering.</p></sec><sec id="s4-3"><title>Principal Findings</title><p>On evaluating the performance of nonreasoning LLMs for predicting primary diagnoses, LLaMA-3.1 demonstrated improved accuracy, achieving 85% correctness in a zero-shot prompting scenario. While not outstanding, this level of performance demonstrates the model&#x2019;s capability to support clinical decision-making without task-specific fine-tuning. Between the reasoning models, OpenAI-O3 demonstrated higher performance with 90% correctness. Our approach aims to enhance efficiency and decision-making through AI-human collaboration. 
Additionally, we generated explanations for each prediction in both reasoning and nonreasoning models to ensure transparency in the model&#x2019;s reasoning.</p><p>DeepSeek-R1 achieved the highest performance in readmission risk prediction (n=218, 72.6%), but the result remains suboptimal, likely in part due to variability within the dataset. Our findings on <italic>ICD-9</italic> prediction align with existing literature [<xref ref-type="bibr" rid="ref4">4</xref>], which shows that general-purpose LLMs struggle with this task. While OpenAI-O3 (n=136, 45.3%) outperformed other models in <italic>ICD-9</italic> prediction, its low accuracy and modest <italic>F</italic><sub>1</sub>-score (0.122) highlight the need for improvement, particularly in reducing false positives. This leads us to a central concern with such models, the risk of hallucinations, especially the &#x201C;faithfulness problem,&#x201D; where the model generates nonfactual or unfaithful information [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. In high-stakes clinical tasks like medical coding and readmission risk prediction, such hallucinations may lead to misclassification, potentially resulting in suboptimal or even harmful decisions. Automation bias further compounds this risk, as clinicians may overrely on confident but incorrect model outputs without adequate verification [<xref ref-type="bibr" rid="ref28">28</xref>]. These issues raise important ethical concerns around patient safety, informed oversight, and the responsible deployment of AI in clinical practice. Even minor issues in input, such as word swaps or misspellings in clinical notes, can drastically alter the output [<xref ref-type="bibr" rid="ref20">20</xref>], especially in the absence of standardized language across clinical documentation. 
Such vulnerabilities undermine reliability and increase the likelihood of misclassification, particularly in tasks like readmission prediction, where both over- and underestimation can have direct consequences on patient outcomes. Addressing these concerns, like misclassification, should be a focus of future research, and we believe that this study offers a valuable foundation. Strategies such as real-time monitoring, feedback loops to flag misclassifications, improved explainability of outputs, and training models on these flagged instances can significantly reduce errors. Incorporating human-in-the-loop or hybrid systems that combine LLMs with clinical expertise may also help prevent misclassifications from escalating. Ultimately, models specifically fine-tuned on clinical text datasets have demonstrated better performance in generating relevant ICD codes and reducing human error, contributing to more accurate documentation, improved patient care, and regulatory compliance [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>].</p><p>Another observation was that reasoning models produced more verbose &#x201C;explanations&#x201D; for primary diagnosis and readmission risk than nonreasoning models. Nonreasoning models generated an average of 70 (SD 5.8) words for primary diagnosis explanations and 54 (SD 5.3) words for readmission risk explanations. In contrast, reasoning models like DeepSeek-R1 averaged 418 (SD 56) words for primary diagnosis explanations and 612 (SD 23) words for readmission risk explanations. OpenAI-O3 generated an average of 713 (SD 30) words for primary diagnosis explanations and 1112 (SD 23) words for readmission risk explanations. While transparency and explanation are essential for clinical trust, excessively long responses may increase cognitive load and hinder real-time decision-making, especially for clinicians operating under time constraints. 
Prior studies have shown that clinicians favor concise, targeted decision support over lengthy narratives, particularly in high-pressure settings [<xref ref-type="bibr" rid="ref31">31</xref>]. Our findings highlight a trade-off between interpretability and usability [<xref ref-type="bibr" rid="ref32">32</xref>]. Although we did not include direct feedback from clinicians, future research should incorporate user-centered evaluation metrics such as response usefulness, reading time, and trust perception to better understand how explanation length influences adoption and workflow integration. Tailoring model output length and clarity through prompt design may improve practical adoption and can help strike a balance between clarity and efficiency.</p></sec><sec id="s4-4"><title>Strengths and Limitations</title><p>Our study used a deidentified dataset to protect patient privacy and confidentiality. However, from an ethical and operational perspective, deploying LLMs in real health care systems raises pressing questions. These include how to protect patient privacy, ensure informed consent, and avoid automation bias or overreliance on potentially hallucinated or unvalidated outputs. Automation bias can lead clinicians to accept AI-generated suggestions without sufficient scrutiny, particularly concerning when LLMs hallucinate plausible-sounding but incorrect diagnoses or codes [<xref ref-type="bibr" rid="ref28">28</xref>]. Recent findings also show that LLMs like GPT-4 fail to adequately represent demographic diversity in clinical scenarios, often reinforcing stereotypes in race- and gender-based presentations of disease [<xref ref-type="bibr" rid="ref33">33</xref>]. 
A related phenomenon, &#x201C;shortcut learning,&#x201D; where AI models may rely on spurious features rather than true clinical signals, further complicates these issues, generating biased outcomes even when protected attributes are not explicitly used as inputs [<xref ref-type="bibr" rid="ref34">34</xref>]. Shortcut learning introduces various biases across different phases of AI development, including data bias, modeling bias, and inference bias [<xref ref-type="bibr" rid="ref34">34</xref>]. Resolving these ethical challenges requires a multifaceted approach: establishing transparent model auditing processes, enforcing rigorous data governance policies, clinician-in-the-loop frameworks, and ensuring that patients and clinicians are adequately informed about the use and limitations of AI tools. Effective deployment of fairness assessments requires comprehensive bias audits across demographic subgroups, transparent model evaluation, and active mitigation strategies by deploying bias mitigation tools [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. Emerging legal frameworks emphasize accountability for biased AI models, underscoring the necessity for comprehensive fairness assessments [<xref ref-type="bibr" rid="ref34">34</xref>]. Interdisciplinary collaboration among ethicists, clinicians, and AI developers will be essential to ensure that these tools are not only technically effective but also fair, trustworthy, and aligned with clinical standards [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. 
Future work should focus on further evaluating these biases by leveraging emerging bias detection tools, refining existing mitigation strategies, and developing accessible, domain-specific frameworks tailored for clinical use.</p><p>Additionally, models like Gemini-1.5 showed a safety-first behavior with the response of &#x201C;Call or text 988 for support&#x201D; when prompted with scenarios involving psychiatric information. While ethically commendable, this may limit utility in some care contexts. This reveals a deeper tension between safety safeguards and task performance that future models must navigate. The challenge lies in balancing the model&#x2019;s need to err on the side of caution to avoid harm while ensuring it provides relevant and actionable insights for health care professionals. Addressing this issue can perhaps be done through the development of more context-sensitive responses or clinician-in-the-loop models that can help mitigate this tradeoff.</p></sec><sec id="s4-5"><title>Conclusions</title><p>This study provides a nuanced understanding of the strengths and limitations of LLMs in health care tasks using zero-shot diagnostic prompting. While none of the models met clinical performance thresholds out of the box, their varied capabilities, particularly LLaMA-3.1&#x2019;s consistent performance among nonreasoning models and OpenAI-O3&#x2019;s strength across reasoning tasks, underscore the potential for leveraging LLMs in clinical workflows with minimal setup. 
However, the reliance on the MIMIC-IV dataset, which reflects a single-center and deidentified hospital population, may limit the generalizability of these findings to broader or more diverse health care settings.</p><p>These results reinforce the need for further adaptation of LLMs through domain-specific training, enhanced data preprocessing (eg, standardizing clinical note structures), and fine-tuning with clinical datasets to improve contextual understanding and minimize hallucinations. Incorporating a real-time flagging system and clinician-in-the-loop frameworks could also enhance safety, usability, and trust. Future work will focus on refining models for hospital readmission risk prediction, evaluating their reasoning quality, and exploring hybrid systems that combine LLM outputs with expert oversight to better align with clinical standards and support reliable decision-making. The limitations identified in this study serve as critical guideposts for shaping future research, ultimately moving the field closer to the safe and effective clinical integration of LLMs.</p></sec></sec></body><back><ack><p>JWG is a 2022 Robert Wood Johnson Foundation Harold Amos Medical Faculty Development Program and declares support from Lacuna Fund (#67), Gordon and Betty Moore Foundation, National Institutes of Health (National Institute of Biomedical Imaging and Bioengineering)-Medical Imaging and Data Resource Center (grant under contracts 75N92020C00008 and 75N92020C00021), and National Heart, Lung, and Blood Institute (award R01HL167811). SP and JWG received support from the National Institutes of Health common fund (award 1R25OD039834-01). PN discloses that this study was conducted independently and is not related to work at ConcertAI, where she is employed. 
PN notes that her participation in this study was performed in her personal capacity and was not funded.</p></ack><notes><sec><title>Data Availability</title><p>The datasets analyzed during this study (Medical Information Mart for Intensive Care-IV) are not publicly available due to controlled access requirements and patient privacy protections that mandate completion of CITI training for researcher access, but the processed datasets and subject IDs used in this study are available from the corresponding author on reasonable request. All subject_ids used in this study are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. All the scripts and queries used for data extraction, analysis, and visualization are shared via a GitHub repository [<xref ref-type="bibr" rid="ref37">37</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>PN conceptualized the study and contributed to project administration. PN, RM, JWG, and SP conducted the formal analysis and visualization. JWG secured funding for the study and provided supervision along with SP. SP contributed to project administration and supervision. 
All authors had access to the data, have read and approved the final manuscript, accepted responsibility for the decision to submit it for publication, and have verified the data.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3"><italic>ICD-10</italic></term><def><p><italic>International Classification of Diseases, Tenth Revision</italic></p></def></def-item><def-item><term id="abb4"><italic>ICD-9</italic></term><def><p><italic>International Classification of Diseases, Ninth Revision</italic></p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">MIMIC</term><def><p>Medical Information Mart for Intensive Care</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Minaee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mikolov</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nikzad</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Large language models: a survey</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 23, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.06196</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name 
name-style="western"><surname>Lu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>N</given-names> </name></person-group><article-title>Large language models in health care: development, applications, and challenges</article-title><source>Health Care Sci</source><year>2023</year><month>08</month><volume>2</volume><issue>4</issue><fpage>255</fpage><lpage>263</lpage><pub-id pub-id-type="doi">10.1002/hcs2.61</pub-id><pub-id pub-id-type="medline">38939520</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kwan</surname><given-names>K</given-names> </name></person-group><article-title>Large language models are good medical coders, if provided with tools</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 6, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2407.12849</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Soroush</surname><given-names>A</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Zimlichman</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Large language models are poor medical coders&#x2014;benchmarking of medical code querying</article-title><source>NEJM AI</source><year>2024</year><month>04</month><day>25</day><volume>1</volume><issue>5</issue><fpage>AIdbp2300040</fpage><pub-id pub-id-type="doi">10.1056/AIdbp2300040</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cooke</surname><given-names>DT</given-names> </name><name name-style="western"><surname>Gelfand</surname><given-names>GAJ</given-names> </name><name name-style="western"><surname>Broghammer</surname><given-names>JA</given-names> </name></person-group><article-title>Billing, coding, and credentialing in the thoracic surgery practice</article-title><source>Thorac Surg Clin</source><year>2011</year><month>08</month><volume>21</volume><issue>3</issue><fpage>349</fpage><lpage>358</lpage><pub-id pub-id-type="doi">10.1016/j.thorsurg.2011.04.003</pub-id><pub-id pub-id-type="medline">21762858</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="web"><article-title>Diagnostic code descriptions: ICD-9</article-title><source>Government of British Columbia</source><access-date>2024-11-09</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www2.gov.bc.ca/gov/content/health/practitioner-professional-resources/msp/physicians/diagnostic-code-descriptions-icd-9">https://www2.gov.bc.ca/gov/content/health/practitioner-professional-resources/msp/physicians/diagnostic-code-descriptions-icd-9</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bulgarelli</surname><given-names>L</given-names> </name><name name-style="western"><surname>Pollard</surname><given-names>T</given-names> </name><name name-style="western"><surname>Horng</surname><given-names>S</given-names> </name><name name-style="western"><surname>Celi</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Mark</surname><given-names>R</given-names> </name></person-group><article-title>MIMIC-IV (version 2.0)</article-title><source>PhysioNet</source><year>2022</year><access-date>2025-07-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://physionet.org/content/mimiciv/2.0/">https://physionet.org/content/mimiciv/2.0/</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bubeck</surname><given-names>S</given-names> </name><name name-style="western"><surname>Petro</surname><given-names>J</given-names> </name></person-group><article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title><source>N Engl J Med</source><year>2023</year><month>03</month><day>30</day><volume>388</volume><issue>13</issue><fpage>1233</fpage><lpage>1239</lpage><pub-id pub-id-type="doi">10.1056/NEJMsr2214184</pub-id><pub-id 
pub-id-type="medline">36988602</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Prompting large language models for zero-shot clinical prediction with structured longitudinal electronic health record data</article-title><source>arXiv</source><comment>Preprint posted online on  Feb 10, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.01713</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>W</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>LLMs for doctors: leveraging medical LLMs to assist doctors, not replace them</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 26, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.18034</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bulgarelli</surname><given-names>L</given-names> </name><name name-style="western"><surname>Pollard</surname><given-names>T</given-names> </name><etal/></person-group><article-title>MIMIC-IV (version 3.1)</article-title><source>PhysioNet</source><year>2024</year><access-date>2025-07-22</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://physionet.org/content/mimiciv/3.1/">https://physionet.org/content/mimiciv/3.1/</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Aali</surname><given-names>A</given-names> </name><name name-style="western"><surname>Veen</surname><given-names>DV</given-names> </name><name name-style="western"><surname>Arefeen</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>MIMIC-IV-Ext-BHC: labeled clinical notes dataset for hospital course summarization (version 1.1.0)</article-title><source>PhysioNet</source><year>2024</year><access-date>2025-07-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://physionet.org/content/labelled-notes-hospital-course/1.2.0/">https://physionet.org/content/labelled-notes-hospital-course/1.2.0/</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Beltagy</surname><given-names>I</given-names> </name><name name-style="western"><surname>Lo</surname><given-names>K</given-names> </name><name name-style="western"><surname>Cohan</surname><given-names>A</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Inui</surname><given-names>K</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>V</given-names> </name><name name-style="western"><surname>Wan</surname><given-names>X</given-names> </name></person-group><article-title>SciBERT: a pretrained language model for scientific text</article-title><conf-name>Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing and the 9th International Joint Conference on Natural 
Language Processing (EMNLP-IJCNLP)</conf-name><conf-date>2019</conf-date><conf-loc>Hong Kong, China</conf-loc><fpage>3615</fpage><lpage>3620</lpage><pub-id pub-id-type="doi">10.18653/v1/D19-1371</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="web"><article-title>scibert&#x2011;scivocab&#x2011;uncased model</article-title><source>PromptLayer</source><year>2025</year><access-date>2024-11-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.promptlayer.com/models/scibertscivocabuncased">https://www.promptlayer.com/models/scibertscivocabuncased</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>Unified Medical Language System (UMLS)</article-title><source>National Library of Medicine</source><access-date>2024-12-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nlm.nih.gov/research/umls/index.html">https://www.nlm.nih.gov/research/umls/index.html</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>S</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Large language models for disease diagnosis: a scoping review</article-title><source>npj Artif Intell</source><volume>1</volume><issue>1</issue><fpage>9</fpage><pub-id pub-id-type="doi">10.1038/s44387-025-00011-z</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Nuthakki</surname><given-names>S</given-names> </name><name name-style="western"><surname>Neela</surname><given-names>S</given-names> 
</name><name name-style="western"><surname>Gichoya</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Purkayastha</surname><given-names>S</given-names> </name></person-group><article-title>Natural language processing of MIMIC-III clinical notes for identifying diagnosis and procedures with neural networks</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 28, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1912.12397</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lovon-Melgarejo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ben-Haddi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Scala</surname><given-names>J</given-names> </name><name name-style="western"><surname>Moreno</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Tamine</surname><given-names>L</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Demner-Fushman</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ananiadou</surname><given-names>S</given-names> </name><name name-style="western"><surname>Thompson</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ondov</surname><given-names>B</given-names> </name></person-group><article-title>Revisiting the MIMIC-IV benchmark: experiments using language models for electronic health records</article-title><conf-name>Proceedings of the First Workshop on Patient-Oriented Language Processing (CL4Health) @ LREC-COLING 2024</conf-name><conf-date>May 20, 2024</conf-date><conf-loc>Torino, Italy</conf-loc><fpage>189</fpage><lpage>196</lpage></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Jaganathan</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Kahanda</surname><given-names>I</given-names> </name><name name-style="western"><surname>Kanewala</surname><given-names>U</given-names> </name></person-group><article-title>Metamorphic testing for robustness and fairness evaluation of LLM-based automated ICD coding applications</article-title><source>Smart Health</source><year>2025</year><month>06</month><volume>36</volume><fpage>100564</fpage><pub-id pub-id-type="doi">10.1016/j.smhl.2025.100564</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>AEW</given-names> </name><name name-style="western"><surname>Bulgarelli</surname><given-names>L</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>L</given-names> </name><etal/></person-group><article-title>MIMIC-IV, a freely accessible electronic health record dataset</article-title><source>Sci Data</source><year>2023</year><month>01</month><day>3</day><volume>10</volume><issue>1</issue><fpage>1</fpage><pub-id pub-id-type="doi">10.1038/s41597-022-01899-x</pub-id><pub-id pub-id-type="medline">36596836</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chiu</surname><given-names>WHK</given-names> </name><name name-style="western"><surname>Ko</surname><given-names>WSK</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>WCS</given-names> </name><name name-style="western"><surname>Hui</surname><given-names>SYJ</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>WCL</given-names> </name><name 
name-style="western"><surname>Kuo</surname><given-names>MD</given-names> </name></person-group><article-title>Evaluating the diagnostic performance of large language models on complex multimodal medical cases</article-title><source>J Med Internet Res</source><year>2024</year><month>05</month><day>13</day><volume>26</volume><fpage>e53724</fpage><pub-id pub-id-type="doi">10.2196/53724</pub-id><pub-id pub-id-type="medline">38739441</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><article-title>Zero-shot prompting</article-title><source>DataCamp</source><access-date>2024-12-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.datacamp.com/tutorial/zero-shot-prompting">https://www.datacamp.com/tutorial/zero-shot-prompting</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><article-title>Zero-shot prompting: a benchmarking framework for AI prompts</article-title><source>Symbio6</source><access-date>2024-12-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://symbio6.nl/en/blog/zero-shot-prompting-benchmarking">https://symbio6.nl/en/blog/zero-shot-prompting-benchmarking</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><article-title>Does GPT-35 or GPT-4 API get minor updates?</article-title><source>OpenAI Developer Community</source><year>2024</year><access-date>2025-05-25</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://community.openai.com/t/does-gpt-3-5-or-gpt-4-api-get-minor-updates/380885">https://community.openai.com/t/does-gpt-3-5-or-gpt-4-api-get-minor-updates/380885</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>Q</given-names> </name><name 
name-style="western"><surname>Schenck</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>F</given-names> </name></person-group><article-title>Faithful AI in medicine: a systematic review with large language models and beyond</article-title><source>medRxiv</source><comment>Preprint posted online on  Jul 1, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.04.18.23288752</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>H</given-names> </name></person-group><article-title>Faithfulness in natural language generation: a systematic survey of analysis, evaluation, and optimization methods</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 10, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2203.05227</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Cascella</surname><given-names>LM</given-names> </name></person-group><article-title>Artificial intelligence risks: automation bias</article-title><source>MedPro 
Group</source><year>2023</year><access-date>2024-12-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.medpro.com/artificial-intelligence-risks-automationbias">https://www.medpro.com/artificial-intelligence-risks-automationbias</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Carberry</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name></person-group><article-title>A hierarchical fine-grained deep learning model for automated medical coding</article-title><conf-name>2024 IEEE 3rd International Conference on Computing and Machine Intelligence (ICMI)</conf-name><conf-date>Apr 13-14, 2024</conf-date><conf-loc>Mt Pleasant, MI, United States</conf-loc><fpage>1</fpage><lpage>6</lpage><pub-id pub-id-type="doi">10.1109/ICMI60790.2024.10585710</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Caralt</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>CBL</given-names> </name><name name-style="western"><surname>Rei</surname><given-names>M</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Demner-Fushman</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ananiadou</surname><given-names>S</given-names> </name><name name-style="western"><surname>Miwa</surname><given-names>M</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tsujii</surname><given-names>J</given-names> </name></person-group><article-title>Continuous predictive modeling of clinical notes and ICD codes in 
patient health records</article-title><conf-name>Proceedings of the 23rd Workshop on Biomedical Natural Language Processing</conf-name><conf-date>Aug 16, 2024</conf-date><conf-loc>Bangkok, Thailand</conf-loc><pub-id pub-id-type="doi">10.18653/v1/2024.bionlp-1.19</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ely</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Osheroff</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Chambliss</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Ebell</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Rosenbaum</surname><given-names>ME</given-names> </name></person-group><article-title>Answering physicians&#x2019; clinical questions: obstacles and potential solutions</article-title><source>J Am Med Inform Assoc</source><year>2005</year><volume>12</volume><issue>2</issue><fpage>217</fpage><lpage>224</lpage><pub-id pub-id-type="doi">10.1197/jamia.M1608</pub-id><pub-id pub-id-type="medline">15561792</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Tambwekar</surname><given-names>P</given-names> </name><name name-style="western"><surname>Gombolay</surname><given-names>M</given-names> </name></person-group><article-title>Towards reconciling usability and usefulness of explainable AI methodologies</article-title><source>arXiv</source><comment>Preprint posted online on  Jan 13, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2301.05347</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zack</surname><given-names>T</given-names> 
</name><name name-style="western"><surname>Lehman</surname><given-names>E</given-names> </name><name name-style="western"><surname>Suzgun</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Assessing the potential of GPT-4 to perpetuate racial and gender biases in health care: a model evaluation study</article-title><source>Lancet Digit Health</source><year>2024</year><month>01</month><volume>6</volume><issue>1</issue><fpage>e12</fpage><lpage>e22</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(23)00225-X</pub-id><pub-id pub-id-type="medline">38123252</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Banerjee</surname><given-names>I</given-names> </name><name name-style="western"><surname>Bhattacharjee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Burns</surname><given-names>JL</given-names> </name><etal/></person-group><article-title>&#x201C;Shortcuts&#x201D; causing bias in radiology artificial intelligence: causes, evaluation, and mitigation</article-title><source>J Am Coll Radiol</source><year>2023</year><month>09</month><volume>20</volume><issue>9</issue><fpage>842</fpage><lpage>851</lpage><pub-id pub-id-type="doi">10.1016/j.jacr.2023.06.025</pub-id><pub-id pub-id-type="medline">37506964</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mehrabi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Morstatter</surname><given-names>F</given-names> </name><name name-style="western"><surname>Saxena</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lerman</surname><given-names>K</given-names> </name><name name-style="western"><surname>Galstyan</surname><given-names>A</given-names> 
</name></person-group><article-title>A survey on bias and fairness in machine learning</article-title><source>ACM Comput Surv</source><year>2022</year><month>07</month><day>31</day><volume>54</volume><issue>6</issue><fpage>1</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1145/3457607</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Price</surname><given-names>WN</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>IG</given-names> </name></person-group><article-title>Privacy in the age of medical big data</article-title><source>Nat Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>37</fpage><lpage>43</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0272-7</pub-id><pub-id pub-id-type="medline">30617331</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="web"><article-title>evaluate_chatbot_llms_for_healthcare</article-title><source>GitHub</source><access-date>2025-07-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/pnaliyatthaliyazchayil/evaluate_chatbot_llms_for_healthcare">https://github.com/pnaliyatthaliyazchayil/evaluate_chatbot_llms_for_healthcare</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional information.</p><media xlink:href="jmir_v27i1e74142_app1.docx" xlink:title="DOCX File, 149 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Nonreasoning large language models (LLMs) and Medical Information Mart for Intensive Care (MIMIC)-IV sample top 10 <italic>ICD-9</italic> (<italic>International Classification of Diseases, Ninth Revision</italic>) codes. 
This figure shows the top 10 <italic>ICD-9</italic> codes from the MIMIC-IV sample and the 3 nonreasoning LLMs. Such graphs can help reveal patterns. Here, we see ischemic heart diseases appearing in both the MIMIC-IV sample and the LLMs, whereas the category of general symptoms was seen only in the 3 LLMs and not in MIMIC-IV, suggesting a potential area for improvement. Diabetes mellitus is seen in the MIMIC-IV sample, LLaMA-3.1, and ChatGPT-4 but not in Gemini-1.5, which aligns with our findings on <italic>ICD-9</italic> code predictions, where Gemini-1.5 underperformed.</p><media xlink:href="jmir_v27i1e74142_app2.png" xlink:title="PNG File, 422 KB"/></supplementary-material></app-group></back></article>