<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e87802</article-id><article-id pub-id-type="doi">10.2196/87802</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Performance Evaluation of GPT-5, Grok 4, and DeepSeek R1 in Interpreting Complete Blood Count Reports for Hematologic Diseases: Retrospective Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Ye</surname><given-names>Xianfei</given-names></name><degrees>MM</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Qi</surname><given-names>Xinglun</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fan</surname><given-names>Lina</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Yu</surname><given-names>Qian</given-names></name><degrees>BMed</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Suming</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ren</surname><given-names>Chunyun</given-names></name><degrees>MM</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Yang</surname><given-names>Dagan</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Laboratory Medicine, The First Affiliated Hospital, Zhejiang University School of Medicine</institution><addr-line>79 Qingchun Road</addr-line><addr-line>Hangzhou</addr-line><addr-line>Zhejiang</addr-line><country>China</country></aff><aff id="aff2"><institution>Key Laboratory of Clinical In Vitro Diagnostic Techniques of Zhejiang Province</institution><addr-line>Hangzhou</addr-line><country>China</country></aff><aff id="aff3"><institution>Institute of Laboratory Medicine, Zhejiang University</institution><addr-line>Hangzhou</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Abou-Bakr</surname><given-names>Asmaa</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Yazici</surname><given-names>Ramiz</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Cerqueira</surname><given-names>Renato</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Dagan Yang, MS, Department of Laboratory Medicine, The First Affiliated Hospital, Zhejiang University School of Medicine, 79 Qingchun Road, Hangzhou, Zhejiang, China, 86 0571-87236383; <email>yangdagan@zju.edu.cn</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>5</day><month>6</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e87802</elocation-id><history><date date-type="received"><day>14</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>06</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>06</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Xianfei Ye, Xinglun Qi, Lina Fan, Qian Yu, Suming Zhou, Chunyun Ren, Dagan Yang. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 5.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e87802"/><abstract><sec><title>Background</title><p>Large language models (LLMs) demonstrate potential in the laboratory, yet rigorous clinical evaluation remains limited. The opacity of LLM decision-making constrains their safe application in interpreting complete blood count (CBC) reports for hematologic diseases.</p></sec><sec><title>Objective</title><p>This study aimed to conduct an exploratory evaluation of GPT-5, Grok 4, and DeepSeek R1 in interpreting real-world CBC reports, particularly their reasoning capabilities and clinical safety.</p></sec><sec sec-type="methods"><title>Methods</title><p>This single-center retrospective study analyzed 100 CBC reports from initial-visit patients with hematologic conditions. After responses were generated by the 3 LLMs using standardized Chinese prompts, four trained laboratory physicians blindly evaluated them across 6 quality and 5 task dimensions. Interrater reliability was assessed using intraclass correlation coefficients (ICCs), and performance differences were assessed based on 4-rater consensus scores and Friedman and Wilcoxon tests. For task 4 (ablation analysis), the McNemar test was used to compare top-1 diagnostic concordance with the gold-standard diagnosis within each model, with and without initial clinical suspicion in the prompt. Error types and distributions were documented during the task evaluation.</p></sec><sec sec-type="results"><title>Results</title><p>DeepSeek R1 demonstrated excellent interrater reliability across most quality dimensions (ICC &#x2265;0.75). 
In the quality dimension, DeepSeek R1 significantly outperformed the other models in comprehensiveness, accuracy, clarity, relevance, and practicality. In the task 4 evaluation, GPT-5 demonstrated the highest concordance (93/100, 93%) with gold-standard diagnoses, followed by DeepSeek R1 (92/100, 92%) and Grok 4 (89/100, 89%). After removing the initial clinical suspicion, these rates decreased to 79% (79/100), 77% (77/100), and 72% (72/100), representing statistically significant within-model reductions for all models (<italic>P</italic>&#x003C;.001). Post hoc error analysis revealed distinct patterns across task dimensions. GPT-5 exhibited 12 hallucinations in the analyzer alert processing task; DeepSeek R1 demonstrated 1 hallucination in the abnormal item identification task, whereas Grok 4 displayed none. All models exhibited reasoning errors and varying degrees of deficiencies in the correlation analysis and preliminary diagnosis tasks, characterized by unwarranted inferences of disease status from isolated results without clinical integration. Grok 4 generated 9 reasoning errors in the clinical management task by providing generic recommendations not tailored to case-specific CBC data, potentially compromising individualized treatment decisions.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>While current LLMs demonstrate potential for interpreting CBC reports in hematologic diseases, they show performance heterogeneity across models. The ablation study findings underscore the necessity of integrating clinical context for accurate laboratory test interpretation. Low scores, hallucinations, and reasoning errors in model outputs indicate that current clinical deployment requires human oversight and quality control. 
As this single-center, Chinese-language exploratory assessment provides only preliminary, possibly context-dependent evidence, multicenter, cross-lingual prospective validation is needed to delineate the practical boundaries and safety standards for clinical deployment.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>hematologic diseases</kwd><kwd>ChatGPT</kwd><kwd>Grok</kwd><kwd>DeepSeek</kwd><kwd>hallucination</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The rapid advancement of next-generation information technologies has enabled large language models (LLMs), exemplified by ChatGPT (OpenAI), to demonstrate unprecedented capabilities in natural language processing, logical reasoning, and content generation [<xref ref-type="bibr" rid="ref1">1</xref>]. In laboratory medicine, LLMs have demonstrated potential utility in multiple contexts, including laboratory test ordering [<xref ref-type="bibr" rid="ref2">2</xref>], report interpretation [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>], intelligent question answering [<xref ref-type="bibr" rid="ref5">5</xref>], qualification examinations [<xref ref-type="bibr" rid="ref6">6</xref>], laboratory management [<xref ref-type="bibr" rid="ref7">7</xref>], and clinical decision support [<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>However, existing evaluations of LLMs predominantly rely on public or simulated datasets, with limited rigorous assessments in authentic clinical environments [<xref ref-type="bibr" rid="ref9">9</xref>]. Critical challenges&#x2014;including model hallucinations, opaque decision-making processes, and inadequate evaluation frameworks&#x2014;severely constrain the safe deployment and widespread adoption of LLMs in clinical practice [<xref ref-type="bibr" rid="ref10">10</xref>]. 
In response, guidelines such as TRIPOD-LLM and the Chatbot Assessment Reporting Tool (CHART) have been introduced, emphasizing the necessity of conducting systematic assessments of clinical artificial intelligence (AI) tools that are multidimensional, interpretable, and transparent [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>The interpretation of complete blood count (CBC) results represents a key application for hematologic disease screening. As a primary diagnostic tool, CBC analysis provides essential clues for disease identification and differential diagnosis. When combined with clinical data from electronic health records (EHRs), CBC information from laboratory analyzers constitutes the foundation for diagnosing hematologic disorders. This process relies heavily on physician experience, leading to subjectivity, high workloads, and a lack of standardization. Traditional machine learning models have shown potential&#x2014;such as CatBoost models for thalassemia identification [<xref ref-type="bibr" rid="ref13">13</xref>], extreme gradient boosting (XGBoost)&#x2013;based classifiers for acute leukemia subtypes [<xref ref-type="bibr" rid="ref14">14</xref>], and support vector machine or artificial neural network models for acute leukemia diagnosis [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. 
However, these models often suffer from limitations including limited interpretability, weak cross-disease generalization, an inability to integrate unstructured clinical narratives, and a lack of interactive explanations [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>].</p><p>In contrast, LLMs leverage robust contextual understanding and chain-of-thought reasoning to identify abnormal values, analyze parameter correlations, simulate clinical reasoning, and generate diagnostic recommendations [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. Notably, the DeepSeek (High-Flyer) model has been deployed in nearly 1000 Chinese hospitals [<xref ref-type="bibr" rid="ref21">21</xref>], and its report interpretation capabilities have been piloted for laboratory result verification, clinical consultation, and patient communication.</p><p>Despite these advances, clinical laboratories currently lack evidence-based criteria for selecting LLMs, comparative evaluations using real-world clinical data, and systematic analyses of critical safety issues such as model hallucinations. This study addresses these limitations by comprehensively evaluating 3 advanced LLMs&#x2014;DeepSeek R1, Grok 4 (SpaceXAI), and GPT-5&#x2014;using real-world clinical CBC data across 6 quality and 5 task dimensions, with particular attention to reasoning capabilities and clinical safety. Our findings aim to provide empirical evidence and practical guidance for developing more reliable and safer AI-assisted interpretation systems for hematologic disease reports.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>We used a comprehensive framework to systematically evaluate the performance of 3 LLMs in interpreting CBC reports for hematologic disorders. 
The workflow consists of four key phases: (1) selection of target CBC reports for hematologic disorders; (2) submission of structured prompts to 3 LLMs; (3) evaluation across 6 quality dimensions&#x2014;comprehensiveness, accuracy, clarity, relevance, practicality, and safety; and (4) evaluation across 5 task dimensions, reflecting clinical capabilities in analyzer alert processing, abnormal item identification, correlation analysis of abnormal items, preliminary diagnosis, and clinical management.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study analyzed a set of EHRs obtained from the laboratory department of The First Affiliated Hospital, Zhejiang University School of Medicine (FAHZU) between June 1, 2025, and July 7, 2025. The study protocol was approved by the institutional review board of FAHZU (IIT2025B-0629). Prior to data extraction, all records were fully deidentified by removing all direct identifiers (eg, name, date of birth, medical record number, contact information, and clinician details) and quasi-identifiers (eg, specific dates, locations, and institutional identifiers). Only data relevant to the study objectives were retained, including patient demographics, chief complaint, symptoms, physical examination findings, initial clinical suspicion, CBC reports, and alert messages generated by hematology analyzers. As the research involved a retrospective analysis of fully deidentified EHR data, the requirement for informed consent was waived by the institutional review board. No participants were contacted, and no compensation was provided.</p></sec><sec id="s2-3"><title>Selection of Clinical Case Reports</title><p>This study retrospectively screened patients with hematologic conditions presenting for their initial visit to the 4 campuses of FAHZU (Yuhang, Qingchun, Zhijiang, and Chengzhan) between June 1, 2025, and July 7, 2025, yielding an initial cohort of 449 cases. 
From this cohort, we selected cases based on characteristic abnormal patterns in CBC reports that could provide diagnostic clues. Cases with normal CBC results or with nonspecific or subtle presentations (eg, those seen in early lymphoma, multiple myeloma, or coagulation disorders) were excluded. Ultimately, 49% (220/449) of the cases were included. The final diagnoses in this dataset were categorized into 4 main groups as follows:</p><p>Category 1 included myeloproliferative neoplasms, characterized by the persistent clonal proliferation of specific cell lineages, such as marked leukocytosis and a full myeloid spectrum in chronic myeloid leukemia; persistently elevated hemoglobin and hematocrit in polycythemia vera; significantly increased platelet counts in essential thrombocythemia; and a characteristic leukoerythroblastic presentation in myelofibrosis.</p><p>Category 2 included acute leukemias and myelodysplastic syndromes, defined by blood cell count abnormalities and the presence of immature cells. This category includes acute myeloid leukemia and acute lymphoblastic leukemia&#x2014;both marked by blasts&#x2014;as well as high-risk myelodysplastic syndromes, which typically present as persistent, unexplained bicytopenia or pancytopenia, often accompanied by blasts.</p><p>Category 3 included cytopenic disorders, defined by reduced counts in one or more blood cell lineages. This category includes aplastic anemia with pancytopenia, immune thrombocytopenia with isolated thrombocytopenia, and various types of anemia characterized by specific red blood cell indices. 
These include microcytic hypochromic anemia in iron deficiency, macrocytic anemia in megaloblastic anemia, and microcytic hypochromic anemia in thalassemia.</p><p>Category 4 included lymphoproliferative neoplasms, characterized by clonal quantitative abnormalities in lymphocytes or plasma cells, including chronic lymphocytic leukemia with sustained absolute lymphocytosis, lymphoma with abnormal lymphocytes, and multiple myeloma, in which circulating plasma cells can be detected in some cases.</p><p>To achieve a balance between sample representativeness and evaluation workload, 100 cases were selected as the final evaluation cohort from the 220-case dataset through stratified random sampling, with disease category serving as the stratification variable.</p></sec><sec id="s2-4"><title>Selection of LLMs</title><p>Three representative LLMs were selected for evaluation in this study: the open-source DeepSeek R1 (released May 28, 2025) and the closed-source models Grok 4 (released July 10, 2025) and GPT-5 (released August 8, 2025). The model strings used were DeepSeek-R1-0528, Grok 4, and GPT-5, respectively. All models were accessed via application programming interfaces using standardized prompts, with testing conducted on August 11, 2025. To ensure consistency and reproducibility, the generation parameters were fixed at a temperature of 0.3 and a top-p value of 1.0. None of the models underwent domain-specific fine-tuning; this study was designed to evaluate their out-of-the-box performance.</p></sec><sec id="s2-5"><title>Querying LLMs</title><p>To evaluate the capabilities of LLMs in interpreting CBC reports for patients with hematologic conditions, a standardized prompt was designed. This prompt explicitly instructed the models to act as an &#x201C;experienced laboratory medicine expert&#x201D; and provide a professional interpretation based on the provided information. 
Input data encompassed patient demographics and key clinical texts from EHRs, specifically the chief complaint, physical examination findings, and initial clinical suspicion. All clinical text inputs were used in their original, unprocessed free-text format. These inputs were integrated with CBC reports containing all numerical parameters, reference intervals, and analyzer alerts. The final gold-standard diagnoses for all cases were established by clinicians according to the World Health Organization Classification (5th edition) criteria [<xref ref-type="bibr" rid="ref22">22</xref>].</p><p>The models were required to address the following 5 tasks in sequence:</p><list list-type="order"><list-item><p>Analyzer alert processing. Interpret the meaning of all alert flags, assess their potential impact on result reliability, and provide specific recommendations for subsequent actions (eg, blood smear review).</p></list-item><list-item><p>Abnormal item identification. List all out-of-range parameters in a structured format, including values, direction of change, and brief clinical significance.</p></list-item><list-item><p>Correlation analysis of abnormal items. Analyze potential pathophysiological relationships among the abnormal indicators, incorporating patient demographics.</p></list-item><list-item><p>Preliminary diagnosis. Propose 1 to 3 of the most likely preliminary diagnoses or differential diagnoses.</p></list-item><list-item><p>Clinical management. Provide specific, actionable suggestions regarding urgent interventions, additional tests, and follow-up.</p></list-item></list><p>Regarding output specifications, the models were required to respond in professional and concise Chinese, strictly follow the numbered sequence (1-5), and limit the total word count to 500 words. Any form of disclaimer was explicitly prohibited to prompt the model to make the most probable judgment. 
Each case query was processed in a new, isolated conversation session to prevent contextual interference between cases.</p><p>Additionally, to evaluate the independent hematologic reasoning capability of LLMs, we conducted an ablation study for task 4 by removing the initial clinical suspicion from the prompt while retaining all other patient information. Each case was thus processed under 2 conditions&#x2014;with and without clinical suspicion&#x2014;enabling comparison of model performance under full and blinded clinical contexts.</p></sec><sec id="s2-6"><title>Evaluation of LLM Outputs</title><p>Through random selection from all eligible personnel at the 4 campuses, we recruited 2 junior evaluators (each with 5 years of experience) and 2 senior evaluators (each with &#x003E;10 years of experience). Prior to formal evaluation, all evaluators completed a standardized training program to ensure consistent application of the evaluation criteria. The training encompassed clinical practice guidelines, authoritative medical literature, and clinical experience. It also included detailed explanations of scoring dimensions, illustrative case demonstrations, and a calibration exercise involving 20 reports. This exercise was repeated until consensus was reached and a Cohen kappa (&#x03BA;) coefficient of 0.7 or above was achieved.</p><p>All model outputs were standardized before evaluator review. Only plain-text final outputs were presented, and visible reasoning traces or reasoning markers were removed when present. This removal was applied only to DeepSeek R1 outputs, as the GPT-5 and Grok 4 application programming interface responses did not contain visible reasoning traces. The standardized outputs were then anonymized, stripped of model identifiers, and presented in randomized order to minimize evaluator recognition based on formatting or stylistic cues.</p><p>We used 2 distinct evaluation checklists. 
For the quality dimensions, a scoring checklist based on a 5-point Likert scale was used (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). For the task dimensions, a 5-point deduction rubric was tailored to each of the 5 tasks (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). The task-specific deduction rubric served as a guide for assigning Likert scores across the 6 quality dimensions for each task output. In cases of ambiguity, evaluators consulted relevant guidelines or literature for clarification. This evaluation generated a total of 36,000 independent quality dimension ratings, calculated as follows: 100 cases&#x00D7;3 models&#x00D7;4 evaluators&#x00D7;5 tasks&#x00D7;6 dimensions. For both the quality and task dimensions, we applied a pragmatic consensus-scoring rule, where the final consensus score was the average of the 4 evaluator ratings. In addition, the raw individual ratings were retained to facilitate distributional visualizations of rating patterns and to analyze low-score assignments by task, model, and evaluator seniority.</p><p>We assessed concordance between the LLMs&#x2019; preliminary diagnoses in task 4 and the initial clinical suspicion (under full-context conditions), as well as the final gold-standard diagnosis (under full-context and ablation conditions). For each model, paired 2&#x00D7;2 contingency tables were constructed based on whether the top-1 suggestion was concordant or discordant with the final gold-standard diagnosis under the 2 conditions. In addition, we classified errors identified during evaluation as either &#x201C;hallucinations&#x201D; (factually fabricated information) or &#x201C;reasoning errors&#x201D; (inferences lacking adequate clinical justification). 
To further emphasize clinical safety, evaluators were advised to assign a score no higher than 3 to any response containing either type of error, and all such cases were systematically recorded and reviewed in a post hoc analysis.</p></sec><sec id="s2-7"><title>Statistical Analysis</title><p>All statistical analyses were conducted using R software (version 4.3.1; R Foundation for Statistical Computing). The reliability of the averaged ratings within the junior and senior evaluator groups was estimated using the 2-way random-effects intraclass correlation coefficient (ICC) for the mean of k=2 raters (ICC[2,2]). This coefficient reflects the reliability of the mean rating derived from 2 randomly selected raters. ICC values were reported with 95% CIs and interpreted according to Cicchetti criteria [<xref ref-type="bibr" rid="ref23">23</xref>]: poor (&#x003C;0.40), fair (0.40&#x2010;0.59), good (0.60&#x2010;0.74), and excellent (&#x2265;0.75). Additionally, we calculated objective concordance metrics for task 4, including top-1 concordance with the initial clinical suspicion and top-1 concordance with the final gold-standard diagnosis under full-context and ablation conditions. To evaluate whether removal of initial clinical suspicion significantly changed diagnostic accuracy within each individual model, we compared the paired binary outcomes (concordant vs discordant with the gold-standard diagnosis) between the 2 prompt conditions using the McNemar test.</p><p>For descriptive reporting of model performance across the 6 quality dimensions, the final 4-rater consensus scores were summarized as medians and IQRs. For task-level performance, each task output was comprehensively evaluated across the 6 quality dimensions, and a task-level score was calculated by averaging the 6 dimension scores assigned by the 4 evaluators for comparative analysis. 
Differences among the 3 LLMs in these scores were analyzed with the Friedman test; where significant, pairwise post hoc comparisons were conducted using the Wilcoxon signed-rank test with Holm-Bonferroni correction for multiple comparisons. All tests were 2-sided, with statistical significance set at <italic>P</italic>&#x003C;.05.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>General Characteristics</title><p>As illustrated in the study design workflow (<xref ref-type="fig" rid="figure1">Figure 1</xref>), after screening, exclusion, and stratified sampling across four categories of hematologic diseases, 100 patients were included in the final evaluation cohort (mean age 58.5, SD 16.9 years; range 23&#x2010;88 years; n=47, 47% male patients). The cohort composition is detailed in <xref ref-type="table" rid="table1">Table 1</xref>. The cohort encompassed diseases characterized by quantitative abnormalities in trilineage blood cell counts (categories 1 and 3), as well as diseases marked by diagnostically significant pathological cells, including blasts, abnormal promyelocytes, abnormal lymphocytes, and plasma cells (categories 2 and 4).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Workflow for the study design. 
CBC: complete blood count; LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e87802_fig01.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Overview of clinical cases (n=100).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics</td><td align="left" valign="bottom">Values</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">Sex, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="left" valign="top">47 (47)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="left" valign="top">53 (53)</td></tr><tr><td align="left" valign="top">Age (years), mean (SD; range)</td><td align="left" valign="top">58.5 (16.9; 23-88)</td></tr><tr><td align="left" valign="top" colspan="2">Disease category, n (%)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Category 1: myeloproliferative neoplasms</td><td align="left" valign="top">22 (22)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;Chronic myeloid leukemia</td><td align="left" valign="top">2 (2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Polycythemia vera</td><td align="char" char="." valign="top">5 (5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Essential thrombocythemia</td><td align="char" char="." 
valign="top">13 (13)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Myelofibrosis</td><td align="char" char="." valign="top">2 (2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Category 2: acute leukemias and myelodysplastic syndromes</td><td align="left" valign="top">25 (25)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;Acute myeloid leukemia<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">9 (9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Acute lymphoblastic leukemia</td><td align="char" char="." valign="top">6 (6)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Myelodysplastic syndromes</td><td align="char" char="." valign="top">3 (3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Acute leukemia of ambiguous lineage</td><td align="char" char="." 
valign="top">7 (7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Category 3: cytopenic disorders</td><td align="left" valign="top">34 (34)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;Aplastic anemia</td><td align="left" valign="top">5 (5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Immune thrombocytopenia</td><td align="char" char="." valign="top">23 (23)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Iron deficiency anemia</td><td align="char" char="." valign="top">4 (4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Megaloblastic anemia</td><td align="char" char="." valign="top">1 (1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Thalassemia</td><td align="char" char="." 
valign="top">1 (1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Category 4: lymphoproliferative neoplasms</td><td align="left" valign="top">19 (19)</td></tr><tr><td align="left" valign="top">&#x2003;&#x2003;Chronic lymphocytic leukemia</td><td align="left" valign="top">12 (12)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Lymphoma with circulating abnormal lymphocytes</td><td align="char" char="." valign="top">5 (5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Multiple myeloma</td><td align="char" char="." valign="top">2 (2)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>The 9 cases of acute myeloid leukemia included 2 cases of acute promyelocytic leukemia.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Interrater Reliability</title><p>Interrater reliability varied across LLMs and evaluator seniority (<xref ref-type="fig" rid="figure2">Figure 2</xref>). Across all 12 evaluations (2 seniority groups&#x00D7;6 dimensions), DeepSeek R1 demonstrated overall excellent reliability, with 9 evaluations achieving excellent reliability (ICC &#x2265;0.75). Grok 4 showed moderate reliability, with 10 evaluations demonstrating good reliability (ICC 0.60&#x2010;0.74) and 2 achieving excellent reliability in the accuracy (ICC 0.782, 95% CI 0.740&#x2010;0.817) and safety (ICC 0.825, 95% CI 0.792&#x2010;0.853) dimensions among senior evaluators. 
GPT-5 exhibited relatively greater variability, with 7 evaluations showing good reliability; notably, the lowest reliability was observed in the clarity dimension among junior evaluators (ICC 0.602, 95% CI 0.525&#x2010;0.666). The remaining 5 evaluations achieved excellent reliability, primarily in the relevance and safety dimensions.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Forest plot of interrater reliability (intraclass correlation coefficient with 95% CI) for 3 large language models across 6 quality dimensions, stratified by evaluator seniority. Background colors indicate reliability levels: poor (&#x003C;0.40, red), fair (0.40&#x2010;0.59, yellow), good (0.60&#x2010;0.74, light yellow), and excellent (&#x2265;0.75, light green).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e87802_fig02.png"/></fig></sec><sec id="s3-3"><title>Quality Dimension Evaluation</title><p>We compared the LLMs&#x2019; performance across 6 quality dimensions using box plots (<xref ref-type="fig" rid="figure3">Figure 3</xref>). DeepSeek R1 significantly outperformed both GPT-5 and Grok 4 in 5 dimensions: comprehensiveness, accuracy, clarity, relevance, and practicality (all <italic>P</italic>&#x003C;.001). In the safety dimension, DeepSeek R1 achieved a median consensus score of 4.0 (IQR 4.0&#x2010;4.5), which was lower than that of GPT-5 (median consensus score 4.25, IQR 4.0&#x2010;4.5), although this difference did not reach statistical significance (<italic>P</italic>=.94). In comparative analyses, GPT-5 and Grok 4 demonstrated comparable accuracy and relevance, with no statistically significant differences (<italic>P</italic>=.13 and <italic>P</italic>=.30, respectively); however, GPT-5 outperformed Grok 4 in comprehensiveness, practicality, and safety (all <italic>P</italic>&#x003C;.001). 
Conversely, Grok 4 exhibited significantly greater clarity than GPT-5 (<italic>P</italic>&#x003C;.001).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Performance comparison of GPT-5, Grok 4, and DeepSeek R1 across 6 quality dimensions. Performance is visualized using box plots, where the bounds indicate the first and third quartiles (Q1 and Q3), the internal line represents the median consensus score, and the whiskers extend to 1.5 times the IQR. Individual plotted points represent 4-rater consensus scores.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e87802_fig03.png"/></fig></sec><sec id="s3-4"><title>Task Dimension Evaluation</title><p>We compared the performance of the 3 LLMs across tasks 1 through 5 (<xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>). DeepSeek R1 achieved the highest or tied-for-highest consensus scores in all tasks, with the only nonsignificant difference from GPT-5 occurring in the clinical management task. In the direct comparison between GPT-5 and Grok 4, the 2 models showed no significant difference in the analyzer alert processing task; furthermore, GPT-5 underperformed Grok 4 in the abnormal item identification task but outperformed it in the remaining 3 tasks.</p><p>For task 4 (preliminary diagnosis), we compared top-1 model outputs against both the initial clinical suspicion and the final gold-standard diagnosis under full-context and ablation conditions (<xref ref-type="fig" rid="figure4">Figure 4</xref>). Under full-context conditions, Grok 4 showed the highest concordance with the initial clinical suspicion (96/100, 96%), whereas GPT-5 achieved the highest concordance with the gold-standard diagnosis (93/100, 93%), followed by DeepSeek R1 (92/100, 92%) and Grok 4 (89/100, 89%). 
After removal of initial clinical suspicion, concordance with the gold-standard diagnosis declined to 79% (79/100) for GPT-5, 77% (77/100) for DeepSeek R1, and 72% (72/100) for Grok 4. The McNemar test showed that these within-model declines were statistically significant for all 3 models (all <italic>P</italic>&#x003C;.001), indicating that initial clinical suspicion materially improved diagnostic accuracy.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Ablation analysis of preliminary diagnosis performance in 3 large language models. Top-1 concordance rates are shown for agreement with the initial clinical suspicion, agreement with the gold-standard diagnosis under full-context prompting, and agreement with the gold-standard diagnosis under the ablation condition. Paired 2&#x00D7;2 contingency tables and the McNemar test were used to evaluate within-model changes after removal of initial clinical suspicion. The numbers shown in the 2&#x00D7;2 contingency tables represent absolute case counts out of the 100 included cases.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e87802_fig04.png"/></fig></sec><sec id="s3-5"><title>Error Distributions and Analysis</title><p>We visualized raw individual evaluator rating distributions using heatmaps (<xref ref-type="fig" rid="figure5">Figure 5</xref>), with 1200 ratings generated for each task-model-evaluator seniority group combination (100 cases&#x00D7;2 evaluators within the seniority group&#x00D7;6 quality dimensions). Acknowledging that central tendency metrics (eg, median consensus scores) can obscure low-score assignments in clinically important tasks, we specifically analyzed the tails of the distribution. Although the rating scale theoretically ranged from 1 to 5, no evaluator assigned a score of 1 in this dataset; therefore, only observed score categories (2-5) are displayed. 
In clinical applications, the absolute incidence of serious errors is far more consequential than average performance; therefore, we quantified responses receiving low scores (&#x003C;3) and qualitatively assessed the clinical risk. These low scores were mainly concentrated in 2 tasks: preliminary diagnosis and clinical management. In the preliminary diagnosis task, Grok 4 showed a relatively high proportion of low scores, at 1.1% (13/1200) for both junior and senior evaluators. In the clinical management task, the proportion of low scores for Grok 4 was even higher, at 2.5% (30/1200) for junior evaluators and 3.8% (46/1200) for senior evaluators.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Heatmap analysis of ratings across 5 task dimensions by evaluator seniority. Score distributions (1-5) for 3 large language models are shown, stratified by junior and senior evaluators, with ratings across the 6 quality dimensions. The color gradient and numeric values in each segment represent the proportion of assignments for each score. Scores of 1 are not displayed because no evaluator assigned a rating of 1 in the study dataset.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e87802_fig05.png"/></fig><p>To elucidate the specific errors underlying these scores, we conducted a narrative review of the error distribution across the 5 task dimensions (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). In the analyzer alert processing task, hallucination errors were most prominent. GPT-5 exhibited 12 such errors, including recommending the manual correction of white blood cell counts while ignoring modern analyzers&#x2019; automatic correction features, and misinterpreting plasma cell percentages from manual differential counts as instrument results&#x2014;all of which could lead to unnecessary manual review and delayed reporting. 
DeepSeek R1 showed minor misunderstandings regarding platelet count interference factors, whereas Grok 4 displayed no hallucinations in this task.</p><p>For the abnormal item identification task, only DeepSeek R1 exhibited a single hallucination error (misinterpretation of reference interval thresholds), potentially leading to false-positive or false-negative clinical judgments. During the correlation analysis task, all models exhibited reasoning errors, characterized by unwarranted inference of disease status from a single laboratory result without integrating prior clinical information. Such errors could precipitate inappropriate clinical escalation, patient anxiety, and unnecessary overtesting. These inferential errors propagated further into the preliminary diagnosis task, where models used definitive terminology lacking guideline support to assert disease progression, potentially misguiding clinicians toward inappropriate treatment decisions. In the clinical management task, Grok 4 generated 9 reasoning errors by providing generic recommendations not tailored to specific CBC reports, which could compromise patient-specific health care and delay appropriate treatment. By comparison, DeepSeek R1 showed only 1 such case, and GPT-5 displayed none.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This multidimensional comparative evaluation of 3 leading LLMs in interpreting CBC reports for hematologic diseases demonstrates substantial potential for integration into laboratory medicine, offering critical insights for clinical implementation while revealing performance heterogeneity among models in handling complex hematologic inference.</p><p>Significant performance heterogeneity emerged across models in a task-dependent manner. DeepSeek R1 demonstrated superior or equivalent performance across all 6 quality dimensions and 5 clinical tasks. 
Its reasoning architecture&#x2014;optimized through large-scale reinforcement learning and pretrained on high-quality Chinese medical literature&#x2014;enabled the generation of localized and logically rigorous interpretations that garnered high evaluator recognition [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. More importantly, the model exhibited substantial independent diagnostic capabilities: despite achieving only 80% (80/100) concordance with preliminary clinical suspicion, it attained 92% (92/100) concordance with gold-standard diagnoses, indicating its capacity for independent reasoning that challenges initial suspicions rather than merely reinforcing them [<xref ref-type="bibr" rid="ref26">26</xref>].</p><p>In contrast, Grok 4 exhibited concerning error patterns in clinically critical tasks and systematic confirmation bias. While demonstrating 96% (96/100) concordance with preliminary clinical suspicion, its concordance with final gold-standard diagnoses was merely 89% (89/100), suggesting a tendency to extract and reinforce initial diagnostic cues from prompts rather than conducting independent inference based on laboratory data. Its concordance with the gold-standard diagnosis also declined markedly after ablation, from 89% (89/100) to 72% (72/100) (McNemar <italic>P</italic>&#x003C;.001). Furthermore, it displayed systematic overdiagnosis tendencies, generating generic management recommendations lacking specific data support during clinical management tasks, with 3.8% (46/1200) of outputs receiving low scores in the senior group.</p><p>GPT-5 demonstrated overall robust performance characteristics, trailing DeepSeek R1 across most quality dimensions while retaining the highest blinded concordance after ablation at 79% (79/100). 
It achieved the highest concordance with gold-standard diagnoses at 93% (93/100), along with 91% (91/100) concordance with initial clinical suspicion, indicating favorable knowledge generalization and diagnostic stability. However, the model showed a unique pattern of technical hallucinations in task dimensions, with 12 factual errors in the analyzer alert processing task, indicating that even generational upgrades of general-purpose LLMs cannot eliminate knowledge gaps in specialized technical domains.</p><p>The ablation study findings underscore the importance of integrating clinical context for accurate interpretation of laboratory results, which is highly consistent with the perspective advocated by Plebani [<xref ref-type="bibr" rid="ref27">27</xref>] on laboratory result interpretation: interpreting laboratory data in isolation from the clinical context is inherently limited and potentially misleading. Plebani [<xref ref-type="bibr" rid="ref27">27</xref>] explicitly noted that expecting AI to achieve more accurate diagnoses than well-trained clinicians based solely on reference intervals and test parameters is &#x201C;absurd,&#x201D; emphasizing that true laboratory medicine value realization depends on the integration of pretest probability and comprehensive clinical information. In our study, all models showed significant declines after removal of the initial clinical suspicion, further supporting the notion that, in the absence of clinical information, models tend to rely excessively on statistical associations rather than pathophysiological reasoning. Although GPT-5 demonstrated relative robustness under blinded conditions, this cannot compensate for the systematic loss of diagnostic accuracy when removed from clinical context. 
Therefore, current applications of LLMs in hematology report interpretation should adhere to the principle advocated by Plebani [<xref ref-type="bibr" rid="ref27">27</xref>] that laboratory results must be interpreted within the context of pretest probability and comprehensive clinical information, restricting AI assistance to scenarios with a complete clinical background rather than permitting its use as an isolated interpreter of test results.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>Previous studies have predominantly relied on publicly available question banks or simulated cases with limited validation in real clinical scenarios, functioning essentially as &#x201C;isolated test result interpretation tools&#x201D; rather than &#x201C;integrated clinical decision support systems.&#x201D; Such research only assessed the models&#x2019; ability to interpret individual laboratory parameters without evaluating LLMs&#x2019; performance in complete diagnostic contexts [<xref ref-type="bibr" rid="ref3">3</xref>]. For instance, Kumari et al [<xref ref-type="bibr" rid="ref28">28</xref>] evaluated 3 LLMs on 50 complex, multitopic hematology cases to assess their case-solving performance; Han et al [<xref ref-type="bibr" rid="ref29">29</xref>] assessed ChatGPT&#x2019;s error-correction capability for nucleic acid testing reports by artificially introducing mistakes; and Cadamuro et al [<xref ref-type="bibr" rid="ref3">3</xref>], representing the European Federation of Clinical Chemistry and Laboratory Medicine Working Group, tested ChatGPT&#x2019;s comprehension using 10 simulated reports of common parameters. Additional studies compared LLMs with physician interpretations of laboratory questions from online health forums [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref19">19</xref>], yet remained confined to general comprehension rather than professional diagnostic reasoning. 
Unlike prior work, this study integrates comprehensive EHR context (chief concerns and physical examination findings) with raw analyzer data and alert flags, constructing an evaluation framework that more authentically replicates clinical laboratory workflows. This paradigm shift from &#x201C;test interpretation tool&#x201D; to &#x201C;clinical decision support&#x201D; provides a feasible pathway for transitioning laboratory report interpretation from bench to bedside deployment.</p><p>Despite the potential of LLMs, hallucinations and reasoning errors remain fundamental barriers to clinical implementation [<xref ref-type="bibr" rid="ref9">9</xref>]. Based on our error classification framework, differentiated strategies are required for precise risk mitigation. For hallucination errors exhibited by GPT-5, retrieval-augmented generation can anchor model outputs to Clinical and Laboratory Standards Institute (CLSI) guidelines and instrument operation manuals, eliminating fabricated instructions such as &#x201C;manual correction of white blood cell counts&#x201D; [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. For reasoning errors demonstrated by Grok 4, senior hematologists should be specifically tasked with reviewing diagnostic recommendations, leveraging their clinical reasoning to calibrate AI overinference. DeepSeek R1&#x2019;s superior performance as an open-source model suggests that domain-specific fine-tuning based on local practice guidelines can significantly enhance evidence-based reasoning consistency, while future integration of multimodal inputs&#x2014;such as peripheral blood smear images and CBC scattergram raw data&#x2014;may further augment diagnostic reliability [<xref ref-type="bibr" rid="ref9">9</xref>]. 
Based on these differentiated strategies, we recommend implementing a tiered human-AI collaboration framework: structured tasks, such as abnormal item identification, may be automated using LLMs to improve efficiency, whereas high-risk steps&#x2014;including analyzer alert interpretation, preliminary diagnosis, and clinical management tasks&#x2014;must undergo mandatory review by hematology experts to ensure clinical safety.</p></sec><sec id="s4-3"><title>Limitations</title><p>This study has several limitations. First, although cases from 4 hospital campuses were included, the retrospective design and stratified sampling based on typical CBC abnormality patterns may introduce selection bias, particularly by excluding diseases with subtle CBC presentations&#x2014;such as early-stage lymphomas and multiple myeloma without cytopenias&#x2014;resulting in an overrepresentation of classic cases. Second, the evaluation was conducted in Chinese, whereas the base training data for GPT-5 and Grok 4 are primarily in English, which may have introduced a language-related confounding factor. In addition, the prompt imposed a strict 500-word output limit, which represented another artificial constraint that may have influenced model behavior. This restriction may have encouraged models to compress content or selectively omit details to comply with the length requirement, thereby affecting the overall quality of the responses. The effects of language and length constraints may have jointly contributed to some of the observed patterns in this study, such as the relatively lower clarity of GPT-5. For Grok 4, the relatively generic and less tailored clinical management recommendations may be related to the model attempting to shorten its response to comply with the prompt instructions. 
For DeepSeek R1, the 500-word limit was applied only to the final output rather than to reasoning tokens; therefore, the models may not have been evaluated under fully equal length constraints, potentially confounding task performance. Future studies should consider language-matched evaluation settings and more flexible output constraints to better reflect real-world model capabilities. Third, the structured fact-verification checklist standardizes the grading format; however, in the absence of a pre-established, case-specific answer key, distinguishing between verifiable minor errors and significant interpretive deviations requires subjective clinical judgment that may vary among evaluators, leaving residual subjectivity in the rubric definitions. Future studies should develop comprehensive, case-specific scoring rubrics with predefined exemplar responses to further enhance objectivity. Fourth, only standardized final outputs were retained for evaluation, with visible reasoning traces excluded. Because this preprocessing was applied only to DeepSeek R1, which uses a reinforcement learning&#x2013;optimized reasoning mechanism to generate high-quality outputs, evaluators were unable to assess its intermediate deductive process. This may have limited the evaluation of reasoning transparency and error formation mechanisms, particularly for DeepSeek R1. Fifth, the number of hematologists participating in the evaluation was limited and all were from the same health care system, which may introduce institution-specific bias and affect the generalizability of the results. 
Finally, given the rapid evolution of LLM technology, our results represent only a performance snapshot at a specific time point; subsequent model updates may alter performance characteristics, limiting the strict reproducibility of findings.</p></sec><sec id="s4-4"><title>Conclusions</title><p>This study reveals significant performance heterogeneity among LLMs in real-world hematologic CBC report interpretation and distinct patterns of error distribution, providing preliminary evidence for laboratory AI tool selection. Clinical deployment should implement a tiered management strategy based on error classification: restricting LLMs to low-risk structured task assistance while mandating expert review for high-risk diagnostic reasoning tasks. As this represents a single-center, Chinese-language exploratory assessment, these findings are context-dependent and necessitate multicenter, cross-lingual prospective validation to further delineate the safety boundaries and generalizable standards for clinical integration of LLMs.</p></sec></sec></body><back><ack><p>We thank Yunying Chen from the Department of Laboratory Medicine, Hangzhou Children&#x2019;s Hospital, for assistance with statistical analysis and figure preparation. KIMI (Moonshot AI) was used only for English language polishing during manuscript preparation and translation. No AI-generated content was included without author review and editing, and the authors take full responsibility for the final manuscript.</p></ack><notes><sec><title>Funding</title><p>This work was supported by the National Key Technologies R&#x0026;D Program of China (2022YFC3602302).</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>XY, XQ, LF, and DY contributed to the conceptualization, investigation, and methodology of the project. 
QY and SZ, as selected members of the junior group, and XY and CR, as selected members of the senior group, evaluated the interpretations generated by the large language models. XY, XQ, and LF were responsible for data curation. XY was responsible for the formal data analysis under the supervision of XQ and DY. XY and DY drafted the original manuscript (writing&#x2014;original draft), which was reviewed and edited by all coauthors (writing&#x2014;review and editing).</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CBC</term><def><p>complete blood count</p></def></def-item><def-item><term id="abb3">CLSI</term><def><p>Clinical and Laboratory Standards Institute</p></def></def-item><def-item><term id="abb4">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb5">FAHZU</term><def><p>The First Affiliated Hospital, Zhejiang University School of Medicine</p></def></def-item><def-item><term id="abb6">ICC</term><def><p>intraclass correlation coefficient</p></def></def-item><def-item><term id="abb7">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> 
</name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zayed</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Frans</surname><given-names>G</given-names> </name><name name-style="western"><surname>Delvaux</surname><given-names>N</given-names> </name></person-group><article-title>Evaluating large language models as clinical laboratory test recommenders in primary and emergency care: a crucial step in clinical decision making</article-title><source>Clin Chem Lab Med</source><year>2025</year><month>10</month><day>27</day><volume>63</volume><issue>11</issue><fpage>2186</fpage><lpage>2197</lpage><pub-id pub-id-type="doi">10.1515/cclm-2025-0647</pub-id><pub-id pub-id-type="medline">40802589</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cadamuro</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cabitza</surname><given-names>F</given-names> </name><name name-style="western"><surname>Debeljak</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Potentials and pitfalls of ChatGPT and natural-language artificial intelligence models for the understanding of laboratory medicine test results. 
An assessment by the European Federation of Clinical Chemistry and Laboratory Medicine (EFLM) Working Group on Artificial Intelligence (WG-AI)</article-title><source>Clin Chem Lab Med</source><year>2023</year><month>06</month><day>27</day><volume>61</volume><issue>7</issue><fpage>1158</fpage><lpage>1166</lpage><pub-id pub-id-type="doi">10.1515/cclm-2023-0355</pub-id><pub-id pub-id-type="medline">37083166</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhuang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Pre-trained ChatGPT for report generation in automated microbial identification and antibiotic susceptibility testing systems</article-title><source>Sci Rep</source><year>2025</year><volume>15</volume><fpage>36283</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-22315-5</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Girton</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Greene</surname><given-names>DN</given-names> </name><name name-style="western"><surname>Messerlian</surname><given-names>G</given-names> </name><name name-style="western"><surname>Keren</surname><given-names>DF</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT vs medical professional: analyzing responses to laboratory medicine questions on social media</article-title><source>Clin Chem</source><year>2024</year><month>09</month><day>3</day><volume>70</volume><issue>9</issue><fpage>1122</fpage><lpage>1139</lpage><pub-id 
pub-id-type="doi">10.1093/clinchem/hvae093</pub-id><pub-id pub-id-type="medline">39013110</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jung</surname><given-names>K</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>HJ</given-names> </name><name name-style="western"><surname>Shin</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Evaluation of the performance of advanced large language models in laboratory medicine using residency examinations</article-title><source>Ann Lab Med</source><year>2026</year><month>05</month><day>1</day><volume>46</volume><issue>3</issue><fpage>327</fpage><lpage>337</lpage><pub-id pub-id-type="doi">10.3343/alm.2025.0200</pub-id><pub-id pub-id-type="medline">41224529</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abusoglu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Serdar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Unlu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Abusoglu</surname><given-names>G</given-names> </name></person-group><article-title>Comparison of three chatbots as an assistant for problem-solving in clinical laboratory</article-title><source>Clin Chem Lab Med</source><year>2024</year><month>06</month><day>25</day><volume>62</volume><issue>7</issue><fpage>1362</fpage><lpage>1366</lpage><pub-id pub-id-type="doi">10.1515/cclm-2023-1058</pub-id><pub-id pub-id-type="medline">38095605</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Q</given-names> </name><name 
name-style="western"><surname>Zhan</surname><given-names>L</given-names> </name><name name-style="western"><surname>Cai</surname><given-names>X</given-names> </name></person-group><article-title>Assessing DeepSeek-R1 for clinical decision support in multidisciplinary laboratory medicine</article-title><source>J Multidiscip Healthc</source><year>2025</year><volume>18</volume><fpage>4979</fpage><lpage>4988</lpage><pub-id pub-id-type="doi">10.2147/JMDH.S538253</pub-id><pub-id pub-id-type="medline">40823482</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>E</given-names> </name><name name-style="western"><surname>Chu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Large language models in medicine: applications, challenges, and future directions</article-title><source>Int J Med Sci</source><year>2025</year><volume>22</volume><issue>11</issue><fpage>2792</fpage><lpage>2801</lpage><pub-id pub-id-type="doi">10.7150/ijms.111780</pub-id><pub-id pub-id-type="medline">40520893</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tam</surname><given-names>TY</given-names> </name><name name-style="western"><surname>Sivarajkumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kapoor</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A framework for human evaluation of large language models in healthcare derived from literature review</article-title><source>NPJ Digit Med</source><year>2024</year><month>09</month><day>28</day><volume>7</volume><issue>1</issue><fpage>258</fpage><pub-id 
pub-id-type="doi">10.1038/s41746-024-01258-7</pub-id><pub-id pub-id-type="medline">39333376</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gallifant</surname><given-names>J</given-names> </name><name name-style="western"><surname>Afshar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ameen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The TRIPOD-LLM reporting guideline for studies using large language models</article-title><source>Nat Med</source><year>2025</year><month>01</month><volume>31</volume><issue>1</issue><fpage>60</fpage><lpage>69</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03425-5</pub-id><pub-id pub-id-type="medline">39779929</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>CHART Collaborative</collab></person-group><article-title>Reporting guidelines for chatbot health advice studies: explanation and elaboration for the Chatbot Assessment Reporting Tool (CHART)</article-title><source>BMJ</source><year>2025</year><month>08</month><day>1</day><volume>390</volume><fpage>e083305</fpage><pub-id pub-id-type="doi">10.1136/bmj-2024-083305</pub-id><pub-id pub-id-type="medline">40750271</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lai</surname><given-names>JX</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Gong</surname><given-names>SS</given-names> </name><etal/></person-group><article-title>Development and validation of an interpretable risk prediction model for the early classification of thalassemia</article-title><source>NPJ Digit 
Med</source><year>2025</year><month>06</month><day>10</day><volume>8</volume><issue>1</issue><fpage>346</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01766-0</pub-id><pub-id pub-id-type="medline">40494920</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alcazer</surname><given-names>V</given-names> </name><name name-style="western"><surname>Le Meur</surname><given-names>G</given-names> </name><name name-style="western"><surname>Roccon</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Evaluation of a machine-learning model based on laboratory parameters for the prediction of acute leukaemia subtypes: a multicentre model development and validation study in France</article-title><source>Lancet Digit Health</source><year>2024</year><month>05</month><volume>6</volume><issue>5</issue><fpage>e323</fpage><lpage>e333</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(24)00044-X</pub-id><pub-id pub-id-type="medline">38670741</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gao</surname><given-names>HW</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>YY</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Acute leukemia warning model combined CBC and CPD data based on machine learning</article-title><source>Int J Lab Hematol</source><year>2025</year><month>12</month><volume>47</volume><issue>6</issue><fpage>1044</fpage><lpage>1053</lpage><pub-id pub-id-type="doi">10.1111/ijlh.14538</pub-id><pub-id pub-id-type="medline">40765161</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Haider</surname><given-names>RZ</given-names> </name><name name-style="western"><surname>Ujjan</surname><given-names>IU</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>NA</given-names> </name><name name-style="western"><surname>Urrechaga</surname><given-names>E</given-names> </name><name name-style="western"><surname>Shamsi</surname><given-names>TS</given-names> </name></person-group><article-title>Beyond the in-practice CBC: the research CBC parameters-driven machine learning predictive modeling for early differentiation among leukemias</article-title><source>Diagnostics (Basel)</source><year>2022</year><month>01</month><day>7</day><volume>12</volume><issue>1</issue><fpage>138</fpage><pub-id pub-id-type="doi">10.3390/diagnostics12010138</pub-id><pub-id pub-id-type="medline">35054304</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>&#x00C7;ubuk&#x00E7;u</surname><given-names>HC</given-names> </name><name name-style="western"><surname>Topcu</surname><given-names>D&#x0130;</given-names> </name><name name-style="western"><surname>Yenice</surname><given-names>S</given-names> </name></person-group><article-title>Machine learning-based clinical decision support using laboratory data</article-title><source>Clin Chem Lab Med</source><year>2023</year><month>11</month><volume>62</volume><issue>5</issue><fpage>793</fpage><lpage>823</lpage><pub-id pub-id-type="doi">10.1515/cclm-2023-1037</pub-id><pub-id pub-id-type="medline">38015744</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miller</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Valdes</surname><given-names>R</given-names> 
</name></person-group><article-title>Rigorous validation of machine learning in laboratory medicine: guidance toward quality improvement</article-title><source>Crit Rev Clin Lab Sci</source><year>2025</year><month>08</month><volume>62</volume><issue>5</issue><fpage>327</fpage><lpage>346</lpage><pub-id pub-id-type="doi">10.1080/10408363.2025.2488842</pub-id><pub-id pub-id-type="medline">40247648</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Meyer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Soleman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Riese</surname><given-names>J</given-names> </name><name name-style="western"><surname>Streichert</surname><given-names>T</given-names> </name></person-group><article-title>Comparison of ChatGPT, Gemini, and Le Chat with physician interpretations of medical laboratory questions from an online health forum</article-title><source>Clin Chem Lab Med</source><year>2024</year><month>11</month><day>26</day><volume>62</volume><issue>12</issue><fpage>2425</fpage><lpage>2434</lpage><pub-id pub-id-type="doi">10.1515/cclm-2024-0246</pub-id><pub-id pub-id-type="medline">38804035</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Bhasuran</surname><given-names>B</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Quality of answers of generative large language models versus peer users for interpreting laboratory test results for lay patients: evaluation study</article-title><source>J Med Internet 
Res</source><year>2024</year><month>04</month><day>17</day><volume>26</volume><fpage>e56655</fpage><pub-id pub-id-type="doi">10.2196/56655</pub-id><pub-id pub-id-type="medline">38630520</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zeng</surname><given-names>D</given-names> </name><name name-style="western"><surname>Qin</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sheng</surname><given-names>B</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>TY</given-names> </name></person-group><article-title>DeepSeek&#x2019;s &#x201C;low-cost&#x201D; adoption across China&#x2019;s hospital systems: too fast, too soon?</article-title><source>JAMA</source><year>2025</year><month>06</month><day>3</day><volume>333</volume><issue>21</issue><fpage>1866</fpage><lpage>1869</lpage><pub-id pub-id-type="doi">10.1001/jama.2025.6571</pub-id><pub-id pub-id-type="medline">40293869</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khoury</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Solary</surname><given-names>E</given-names> </name><name name-style="western"><surname>Abla</surname><given-names>O</given-names> </name><etal/></person-group><article-title>The 5th edition of the World Health Organization Classification of Haematolymphoid Tumours: myeloid and histiocytic/dendritic neoplasms</article-title><source>Leukemia</source><year>2022</year><month>07</month><volume>36</volume><issue>7</issue><fpage>1703</fpage><lpage>1719</lpage><pub-id pub-id-type="doi">10.1038/s41375-022-01613-1</pub-id><pub-id pub-id-type="medline">35732831</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cicchetti</surname><given-names>DV</given-names> </name></person-group><article-title>Guidelines, criteria, and rules of thumb for evaluating normed and standardized assessment instruments in psychology</article-title><source>Psychol Assess</source><year>1994</year><volume>6</volume><issue>4</issue><fpage>284</fpage><lpage>290</lpage><pub-id pub-id-type="doi">10.1037/1040-3590.6.4.284</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tordjman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yuce</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Comparative benchmarking of the DeepSeek large language model on medical tasks and clinical reasoning</article-title><source>Nat Med</source><year>2025</year><month>08</month><volume>31</volume><issue>8</issue><fpage>2550</fpage><lpage>2555</lpage><pub-id pub-id-type="doi">10.1038/s41591-025-03726-3</pub-id><pub-id pub-id-type="medline">40267969</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sandmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hegselmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fujarski</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Benchmark evaluation of DeepSeek large language models in clinical decision-making</article-title><source>Nat Med</source><year>2025</year><month>08</month><volume>31</volume><issue>8</issue><fpage>2546</fpage><lpage>2549</lpage><pub-id 
pub-id-type="doi">10.1038/s41591-025-03727-2</pub-id><pub-id pub-id-type="medline">40267970</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yi</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>F</given-names> </name></person-group><article-title>Performance evaluation of large language models with chain-of-thought reasoning ability in clinical laboratory case interpretation</article-title><source>Clin Chem Lab Med</source><year>2025</year><month>07</month><day>28</day><volume>63</volume><issue>8</issue><fpage>e199</fpage><lpage>e201</lpage><pub-id pub-id-type="doi">10.1515/cclm-2025-0055</pub-id><pub-id pub-id-type="medline">40023838</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Plebani</surname><given-names>M</given-names> </name></person-group><article-title>ChatGPT: Angel or demond? 
Critical thinking is still needed</article-title><source>Clin Chem Lab Med</source><year>2023</year><month>06</month><day>27</day><volume>61</volume><issue>7</issue><fpage>1131</fpage><lpage>1132</lpage><pub-id pub-id-type="doi">10.1515/cclm-2023-0387</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kumari</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kumari</surname><given-names>A</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Large language models in hematology case solving: a comparative study of ChatGPT-3.5, Google Bard, and Microsoft Bing</article-title><source>Cureus</source><year>2023</year><month>08</month><volume>15</volume><issue>8</issue><fpage>e43861</fpage><pub-id pub-id-type="doi">10.7759/cureus.43861</pub-id><pub-id pub-id-type="medline">37736448</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Shan</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Evaluation of error detection and treatment recommendations in nucleic acid test reports using ChatGPT models</article-title><source>Clin Chem Lab Med</source><year>2025</year><month>08</month><day>26</day><volume>63</volume><issue>9</issue><fpage>1698</fpage><lpage>1708</lpage><pub-id pub-id-type="doi">10.1515/cclm-2025-0089</pub-id><pub-id pub-id-type="medline">40249886</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Nanua</surname><given-names>S</given-names> </name><name name-style="western"><surname>Steward</surname><given-names>R</given-names> </name><name name-style="western"><surname>Neely</surname><given-names>B</given-names> </name><name name-style="western"><surname>Datto</surname><given-names>M</given-names> </name><name name-style="western"><surname>Youens</surname><given-names>K</given-names> </name></person-group><article-title>Retrieval-augmented generation for interpreting clinical laboratory regulations using large language models</article-title><source>J Pathol Inform</source><year>2025</year><month>11</month><volume>19</volume><fpage>100520</fpage><pub-id pub-id-type="doi">10.1016/j.jpi.2025.100520</pub-id><pub-id pub-id-type="medline">41244595</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Evaluation checklists for quality dimensions.</p><media xlink:href="jmir_v28i1e87802_app1.xlsx" xlink:title="XLSX File, 12 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Evaluation checklists for task dimensions.</p><media xlink:href="jmir_v28i1e87802_app2.xlsx" xlink:title="XLSX File, 14 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Performance comparison of GPT-5, Grok 4, and DeepSeek R1 across 5 task dimensions.</p><media xlink:href="jmir_v28i1e87802_app3.docx" xlink:title="DOCX File, 586 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Narrative review of model errors across 5 task dimensions.</p><media xlink:href="jmir_v28i1e87802_app4.docx" xlink:title="DOCX File, 15 KB"/></supplementary-material></app-group></back></article>