<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e91222</article-id><article-id pub-id-type="doi">10.2196/91222</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Quality Evaluation of Large Language Model&#x2013;Assisted Generation of Initial Senior Physician Ward Round Records for Patients With Acute Poisoning: Cross-Sectional Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Zhu</surname><given-names>Junping</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Pan</surname><given-names>Wei</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Yonghong</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Yan</surname><given-names>Kui</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fang</surname><given-names>Zhicheng</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Yang</surname><given-names>Xianyi</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Emergency Medicine, Taihe Hospital, Hubei University of Medicine</institution><addr-line>Renmin South Road 32, Maojian District</addr-line><addr-line>Shiyan</addr-line><addr-line>Hubei</addr-line><country>China</country></aff><aff id="aff2"><institution>Hubei Provincial Clinical Medical Research Center for Pneumoconiosis and Poisoning, Hubei Provincial Hospital of Integrated Chinese &#x0026; Western Medicine</institution><addr-line>Wuhan</addr-line><addr-line>Hubei</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Steenstra</surname><given-names>Ivan</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Qian</surname><given-names>Guangwu</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Zaghir</surname><given-names>Jamil</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Xianyi Yang, MD, Department of Emergency Medicine, Taihe Hospital, Hubei University of Medicine, Renmin South Road 32, Maojian District, Shiyan, Hubei, 442012, China, 86 13593776564; <email>hbsyyxy@163.com</email></corresp></author-notes><pub-date 
pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>30</day><month>6</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e91222</elocation-id><history><date date-type="received"><day>11</day><month>01</month><year>2026</year></date><date date-type="rev-recd"><day>07</day><month>06</month><year>2026</year></date><date date-type="accepted"><day>08</day><month>06</month><year>2026</year></date></history><copyright-statement>&#x00A9; Junping Zhu, Wei Pan, Yonghong Wang, Kui Yan, Zhicheng Fang, Xianyi Yang. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 30.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e91222"/><abstract><sec><title>Background</title><p>Large language models (LLMs) have shown potential in medical text generation. Senior physician ward round records are critical documents whose quality reflects the accuracy and continuity of clinical decision-making. 
The initial record is particularly important, as it represents the first formal senior-level synthesis of a patient&#x2019;s presentation, establishing the diagnostic framework and treatment direction for all subsequent care. The quality of LLM-generated initial records for acute poisoning remains unclear.</p></sec><sec><title>Objective</title><p>Focusing on patients with acute poisoning, this study systematically compared medical record writing quality among DeepSeek, ChatGPT (OpenAI), and human physicians to clarify the clinical value of LLMs.</p></sec><sec sec-type="methods"><title>Methods</title><p>A retrospective analysis included 256 cases of acute poisoning from the emergency department ward of Taihe Hospital, Hubei University of Medicine. DeepSeek-V3.2-Exp and GPT-5.1 generated senior physician ward round records from standardized Chinese-language prompts, which were compared with the original medical charts. Blinded evaluations were performed by 3 senior emergency physicians, who scored overall quality across 5 dimensions on a Likert scale (from 1 to 5): case characteristics, current diagnosis, differential diagnosis, treatment plan, and prognosis assessment. Error frequencies were documented under 3 categories (inaccuracies, omissions, and fabrications), and potential harm was assessed using a modified Agency for Healthcare Research and Quality harm scale.</p></sec><sec sec-type="results"><title>Results</title><p>DeepSeek achieved the highest mean total score (24.14, SD 0.90), which was significantly higher than ChatGPT (23.30, SD 1.42; <italic>P</italic>&#x003C;.001) and the physician group (23.86, SD 0.86; <italic>P</italic>=.02). DeepSeek had the highest score for differential diagnosis (mean 4.98, SD 0.10) and prognosis assessment (mean 4.73, SD 0.42) and was comparable to physicians in case characteristics (DeepSeek: mean 4.90, SD 0.23; physicians: mean 4.96, SD 0.15; <italic>P</italic>&#x003E;.05). 
For drug and pesticide poisoning, DeepSeek&#x2019;s mean total scores (24.23, SD 0.75 and 23.92, SD 1.14, respectively) were significantly higher than ChatGPT&#x2019;s (23.34, SD 1.33 and 22.78, SD 1.33, respectively; <italic>P</italic>&#x003C;.001 for both). In biological toxin poisoning, DeepSeek (mean 23.97, SD 0.96) and physicians (mean 24.26, SD 0.62) scored similarly, both significantly higher than ChatGPT (mean 22.53, SD 1.86; <italic>P</italic>&#x003C;.001). Overall potential harm scores were low across all 3 groups (&#x003C;1 point), without significant differences (<italic>P</italic>=.38), although high-harm records were significantly more frequent in both LLM groups than in the physician group (<italic>P</italic>=.02).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>LLMs performed satisfactorily in generating initial senior physician ward round records for acute poisoning, with DeepSeek particularly outperforming the physician group in differential diagnosis and prognosis assessment and showing potential to assist clinical documentation. However, the significantly higher proportion of high-harm errors in LLM-generated records underscores the need for mandatory physician review before incorporation into official medical records.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>generative artificial intelligence</kwd><kwd>emergency department</kwd><kwd>ward round records</kwd><kwd>clinical documentation</kwd><kwd>acute poisoning</kwd><kwd>medical records</kwd><kwd>DeepSeek</kwd><kwd>ChatGPT</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Acute poisoning is a common critical condition in emergency departments, characterized by rapid progression and high mortality rates, which places significant demands on clinicians&#x2019; rapid diagnostic and management capabilities [<xref ref-type="bibr" rid="ref1">1</xref>]. 
Senior physician ward rounds serve as a core component of medical quality control, and senior physician ward round records are essential medical documents for organizing clinical information and clarifying diagnostic and therapeutic directions. They also serve as critical evidence for teaching, quality traceability, and medical dispute resolution [<xref ref-type="bibr" rid="ref2">2</xref>]. Among these documents, the initial senior physician ward round record is of particular significance: it represents the first formal synthesis of a patient&#x2019;s clinical presentation by a senior physician, establishing the diagnostic framework and treatment direction that guides all subsequent care. Its quality is, therefore, directly linked to patient safety and care continuity. However, heavy clinical workloads often prevent physicians from ensuring the standardization and completeness of ward round records, thereby affecting the quality of care [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>In recent years, large language models (LLMs) such as ChatGPT and DeepSeek have been increasingly applied in the medical field [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>], demonstrating potential in patient education and clinical support roles [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. Williams et al [<xref ref-type="bibr" rid="ref9">9</xref>] showed that discharge summaries generated by GPT-4 were comparable in quality to those written by physicians. G&#x00FC;n [<xref ref-type="bibr" rid="ref10">10</xref>] found high consistency between ChatGPT and emergency physicians in blood gas analysis interpretation. LLMs possess efficient text generation and information integration capabilities and can theoretically generate ward round record frameworks that comply with clinical standards, reducing physicians&#x2019; documentation workload [<xref ref-type="bibr" rid="ref11">11</xref>]. 
However, for senior physician ward round records in emergency department wards&#x2014;a document type requiring both timeliness and clinical complexity&#x2014;validation based on real clinical data is lacking, particularly in Chinese-language medical documentation and the specialized field of acute poisoning [<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>DeepSeek, a representative LLM developed in China, is free to use and has strong Chinese-language comprehension capabilities [<xref ref-type="bibr" rid="ref13">13</xref>]. This study, based on real clinical records from an emergency department, aimed to systematically compare the quality of initial senior physician ward round records generated by DeepSeek and ChatGPT against the quality of those written by human physicians using real acute poisoning cases across multiple evaluation dimensions. We hypothesized that LLMs would demonstrate comparable or superior performance to that of physicians in structured documentation tasks&#x2014;particularly in information integration and differential diagnosis&#x2014;while potentially showing deficiencies in complex clinical judgment domains such as definitive diagnosis formulation.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>This was a retrospective cross-sectional study. LLM record generation and the blinded evaluations were conducted from September 2025 to December 2025 at the emergency department of Taihe Hospital, Hubei University of Medicine; the clinical records analyzed were from patients with acute poisoning admitted between January 2023 and December 2024.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>The study protocol was approved by the hospital&#x2019;s ethics committee (ethics approval number: 2025KS187). 
All medical record data were deidentified prior to analysis.</p></sec><sec id="s2-3"><title>Study Population</title><p>Inclusion criteria were (1) patients with acute poisoning admitted between January 2023 and December 2024; (2) complete medical records, including initial progress notes, attending physician ward round records, and initial senior physician ward round records; (3) hospital stay of 48 hours or longer; and (4) confirmed diagnosis of acute poisoning. Exclusion criteria were (1) incomplete medical records; (2) concurrent severe trauma, advanced cancer, or other critical conditions; and (3) chronic poisoning cases.</p><p>A total of 256 patient records were ultimately included and were categorized by poisoning type: 43% (n=110) cases of drug poisoning, 18.8% (n=48) cases of pesticide poisoning, 12.5% (n=32) cases of gas poisoning, 12.1% (n=31) cases of biological toxin poisoning, and 13.7% (n=35) cases of other poisoning (including food poisoning, alcohol poisoning, rodenticide poisoning, and chemical poisoning; <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The distribution of poisoning types reflects the actual clinical epidemiology of acute poisoning in our emergency department, with drug poisoning being the most prevalent category, consistent with national epidemiological trends [<xref ref-type="bibr" rid="ref1">1</xref>]. No artificial balancing of the dataset was performed as the primary objective was to evaluate LLM performance under real-world, ecologically valid conditions.</p></sec><sec id="s2-4"><title>Ward Round Record Generation</title><p>The study workflow is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>. Initial progress notes and attending physician ward round records were extracted from the electronic medical record system as input data for each patient. 
DeepSeek-V3.2-Exp (November 2025 version) and GPT-5.1 (November 2025 version) were used to generate initial senior physician ward round records according to a standardized prompt template. All input data and prompts were in Chinese, consistent with standard clinical practice in our Chinese hospital setting, and all generated records were in Chinese, matching the language of the source medical records. The prompt design referenced the Basic Standards for Medical Record Writing issued by the National Health Commission of China on March 1, 2010. The original senior physician ward round records in the medical charts served as the control group.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Research flowchart. AHRQ: Agency for Healthcare Research and Quality; API: application programming interface; EMR: electronic medical record.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e91222_fig01.png"/></fig><p>DeepSeek-V3.2-Exp was accessed via an application programming interface (API), with parameters set to a temperature of 0.3, top_p of 0.95, max_tokens of 4096, frequency_penalty of 0, and presence_penalty of 0. GPT-5.1 was accessed through the official web interface with the memory function disabled, with conversation history cleared before each case to ensure session independence. The 2 LLMs were accessed via different methods owing to practical constraints during the study period: DeepSeek offered a stable API end point enabling systematic batch processing with controlled parameters, whereas API access to ChatGPT was not available to our research team, necessitating web interface use. A relatively low temperature value (0.3) was selected for DeepSeek to enhance output determinism and reduce the impact of randomness on documentation quality. 
All initial senior physician ward round records were generated during the final week of November 2025 to control for potential variation from model version updates. Each case was generated only once without repeated generation or manual selection to ensure objectivity of results.</p></sec><sec id="s2-5"><title>Evaluation Methods</title><p>The evaluation team consisted of 3 senior emergency physicians (attending physician level or above; mean experience 10 years). A double-blind design was used, with evaluators unaware of the record sources. Using the initial progress notes and attending physician ward round records as reference, evaluators scored both LLM-generated and actual senior physician ward round records on 5 dimensions: case characteristics, current diagnosis, differential diagnosis, treatment plan, and prognosis assessment. A self-designed senior physician ward round record scoring form was used as the evaluation tool (<xref ref-type="table" rid="table1">Table 1</xref>), with each dimension scored on a Likert scale from 1 to 5.</p><p>Error frequencies were documented under three prespecified categories: (1) inaccuracies&#x2014;errors in how existing clinical information was processed or recorded, such as misattributed laboratory values, incorrect dates, or copying and pasting errors; (2) omissions&#x2014;clinically relevant information present in the input data that was absent from the generated record; and (3) fabrications&#x2014;content in the generated record that could not be traced to or was directly contradicted by the input clinical data, such as documenting a treatment not administered or a physical finding inconsistent with the patient&#x2019;s actual examination. We use the term &#x201C;fabrication&#x201D; in preference to &#x201C;hallucination&#x201D; throughout this paper as it more precisely denotes the generation of clinically unsupported content. 
In physician-written records, fabrication-type errors corresponded to documentation of events or findings that did not occur. This taxonomy was adapted from established LLM error classification frameworks for clinical text [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. Additionally, potential harm was assessed using a modified Agency for Healthcare Research and Quality harm scale [<xref ref-type="bibr" rid="ref16">16</xref>].</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Evaluation dimensions of the senior physician ward round record scoring form<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dimension</td><td align="left" valign="bottom">Evaluation content</td></tr></thead><tbody><tr><td align="left" valign="top">Case characteristics</td><td align="left" valign="top">Summarization of chief concern and present illness, integration of physical signs and auxiliary examinations, and summary of disease progression</td></tr><tr><td align="left" valign="top">Current diagnosis</td><td align="left" valign="top">Accuracy of primary diagnosis, completeness of diagnostic evidence, and rationality of diagnostic reasoning</td></tr><tr><td align="left" valign="top">Differential diagnosis</td><td align="left" valign="top">Comprehensiveness of differential diagnoses, sufficiency of differentiating evidence, and rationality of diagnostic exclusion</td></tr><tr><td align="left" valign="top">Treatment plan</td><td align="left" valign="top">Specificity of treatment regimen, plan for further investigations, and key points of condition monitoring</td></tr><tr><td align="left" valign="top">Prognosis assessment</td><td align="left" valign="top">Basis of prognostic judgment, identification of risk factors, and rationality of outcome prediction</td></tr></tbody></table><table-wrap-foot><fn 
id="table1fn1"><p><sup>a</sup>Each dimension was scored on a 5-point Likert scale: 1=&#x201C;strongly disagree,&#x201D; 2=&#x201C;disagree,&#x201D; 3=&#x201C;neutral,&#x201D; 4=&#x201C;agree,&#x201D; and 5=&#x201C;strongly agree.&#x201D;</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-6"><title>Statistical Analysis</title><p>Statistical analysis was performed using SPSS (version 27.0; IBM Corp). The intraclass correlation coefficient was used to assess interrater reliability, with mean scores used for subsequent analysis. Continuous variables were expressed as means and SDs with 1-way ANOVA for 3-group comparisons and Bonferroni correction for pairwise post hoc comparisons. Categorical variables were expressed as frequencies and percentages, with chi-square or Fisher exact tests for between-group comparisons. A <italic>P</italic> value of less than .05 was considered statistically significant.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Interrater Reliability</title><p>Interrater reliability among the 3 evaluators was good, with an intraclass correlation coefficient for total scores of 0.839 (95% CI 0.819&#x2010;0.858; <italic>P</italic>&#x003C;.001). Mean scores for each dimension and total scores were used for subsequent analysis.</p></sec><sec id="s3-2"><title>Total Score Comparison</title><p>Total scores differed significantly among the 3 groups (<italic>F</italic><sub>2,765</sub>=38.918; <italic>P</italic>&#x003C;.001). 
Bonferroni post hoc comparisons showed that the DeepSeek group achieved the highest total score, significantly higher than both the ChatGPT group and the physician group; the physician group&#x2019;s total score was also significantly higher than that of the ChatGPT group (<xref ref-type="fig" rid="figure2">Figure 2</xref> and <xref ref-type="table" rid="table2">Table 2</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Comparison of total quality scores among GPT-5.1&#x2013;, DeepSeek-V3.2-Exp&#x2013;, and physician-written records. The boxes represent IQRs, the horizontal lines indicate medians, the diamonds denote means, and the whiskers extend to 1.5 times the IQR. Individual data points are shown as jittered dots. One-way ANOVA with Bonferroni post hoc correction was conducted. *<italic>P</italic>&#x003C;.05; ****<italic>P</italic>&#x003C;.0001.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e91222_fig02.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Comparison of scores across 5 dimensions among the 3 groups.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="middle">Dimension</td><td align="left" valign="middle">GPT-5.1 score (n=256), mean (SD)</td><td align="left" valign="middle">DeepSeek-V3.2-Exp score (n=256), mean (SD)</td><td align="left" valign="middle">Physician score (n=256), mean (SD)</td><td align="left" valign="middle"><italic>F</italic> test (<italic>df</italic>)<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="middle"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="middle">Case characteristics (range 1-5)</td><td align="left" valign="middle">4.81 (0.41)</td><td align="left" valign="middle">4.90 (0.23)<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" 
valign="middle">4.96 (0.15)</td><td align="left" valign="middle">18.27 (2, 765)</td><td align="left" valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle">Current diagnosis (range 1-5)</td><td align="left" valign="middle">4.43 (0.58)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="middle">4.69 (0.43)<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="middle">4.86 (0.30)</td><td align="left" valign="middle">57.94 (2, 765)</td><td align="left" valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle">Differential diagnosis (range 1-5)</td><td align="left" valign="middle">4.61 (0.73)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="middle">4.98 (0.10)<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="middle">4.75 (0.36)</td><td align="left" valign="middle">37.21 (2, 765)</td><td align="left" valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle">Treatment plan (range 1-5)</td><td align="left" valign="middle">4.81 (0.38)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="middle">4.83 (0.37)</td><td align="left" valign="middle">4.89 (0.30)</td><td align="left" valign="middle">4.10 (2, 765)</td><td align="left" valign="middle">.02</td></tr><tr><td align="left" valign="middle">Prognosis assessment (range 1-5)</td><td align="left" valign="middle">4.65 (0.47)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="middle">4.73 (0.42)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="middle">4.39 (0.54)</td><td align="left" valign="middle">35.67 (2, 765)</td><td align="left" valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle">Total score (range 5-25)</td><td align="left" valign="middle">23.30 (1.42)<sup><xref 
ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="middle">24.14 (0.90)<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="middle">23.86 (0.86)</td><td align="left" valign="middle">38.92 (2, 765)</td><td align="left" valign="middle">&#x003C;.001</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup><italic>F</italic> statistic from one-way ANOVA.</p></fn><fn id="table2fn2"><p><sup>b</sup><italic>P</italic>&#x003C;.05 vs the GPT-5.1 group (Bonferroni correction). By definition, this footnote does not appear in the &#x201C;GPT-5.1&#x201D; column.</p></fn><fn id="table2fn3"><p><sup>c</sup><italic>P</italic>&#x003C;.05 vs the physician group. By definition, this footnote does not appear in the &#x201C;Physician&#x201D; column.</p></fn><fn id="table2fn4"><p><sup>d</sup>Significance vs both groups.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Comparison of Dimension Scores</title><p>As shown in <xref ref-type="table" rid="table2">Table 2</xref>, DeepSeek demonstrated optimal performance in both differential diagnosis (mean 4.98, SD 0.10) and prognosis assessment (mean 4.73, SD 0.42). DeepSeek&#x2019;s case characteristics score (mean 4.90, SD 0.23) was statistically comparable to that of the physician group (mean 4.96, SD 0.15; <italic>P</italic>&#x003E;.05), indicating no significant difference between them. The physician group achieved the highest scores in current diagnosis (mean 4.86, SD 0.30) and treatment planning (mean 4.89, SD 0.30). Notably, the physician group&#x2019;s score in prognosis assessment (mean 4.39, SD 0.54) was significantly lower than that of both LLMs (<italic>P</italic>&#x003C;.001).</p></sec><sec id="s3-4"><title>Subgroup Analysis by Poisoning Type</title><p>In drug poisoning cases, the DeepSeek group scored highest (mean 24.23, SD 0.75), with statistically significant differences in all pairwise comparisons. 
In pesticide poisoning cases, there was no significant difference between the DeepSeek and physician groups, but both scores were significantly higher than that of the ChatGPT group. In biological toxin poisoning cases, the physician and DeepSeek groups had similar scores, both significantly higher than that of the ChatGPT group. In gas poisoning and other poisoning cases, no statistically significant differences were observed among the 3 groups (<xref ref-type="table" rid="table3">Table 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Subgroup analysis by poisoning type.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="middle">Poisoning type</td><td align="left" valign="middle">Cases (n=256), n (%)</td><td align="left" valign="middle">GPT-5.1 score (range 5-25), mean (SD)</td><td align="left" valign="middle">DeepSeek-V3.2-Exp score (range 5-25), mean (SD)</td><td align="left" valign="middle">Physician score (range 5-25), mean (SD)</td><td align="left" valign="middle"><italic>F</italic> test (<italic>df</italic>)</td><td align="left" valign="middle"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="middle">Drug</td><td align="left" valign="middle">110 (43)</td><td align="left" valign="middle">23.34 (1.33)<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="middle">24.23 (0.75)<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td><td align="left" valign="middle">23.79 (0.88)</td><td align="left" valign="middle">21.39 (2, 327)</td><td align="left" valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle">Pesticide</td><td align="left" valign="middle">48 (18.8)</td><td align="left" valign="middle">22.78 (1.33)<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="middle">23.92 (1.14)<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td 
align="left" valign="middle">23.77 (1.00)</td><td align="left" valign="middle">13.46 (2, 141)</td><td align="left" valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle">Gas</td><td align="left" valign="middle">32 (12.5)</td><td align="left" valign="middle">24.29 (0.95)</td><td align="left" valign="middle">24.42 (0.86)</td><td align="left" valign="middle">23.95 (0.79)</td><td align="left" valign="middle">2.50 (2, 93)</td><td align="left" valign="middle">.09</td></tr><tr><td align="left" valign="middle">Biological toxin</td><td align="left" valign="middle">31 (12.1)</td><td align="left" valign="middle">22.53 (1.86)<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="middle">23.97 (0.96)<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="middle">24.26 (0.62)</td><td align="left" valign="middle">16.71 (2, 90)</td><td align="left" valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle">Other</td><td align="left" valign="middle">35 (13.7)</td><td align="left" valign="middle">23.70 (1.03)</td><td align="left" valign="middle">24.01 (0.88)</td><td align="left" valign="middle">23.80 (0.73)</td><td align="left" valign="middle">1.14 (2, 102)</td><td align="left" valign="middle">.33</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup><italic>P</italic>&#x003C;.05 vs the physician group. By definition, this footnote does not appear in the &#x201C;Physician&#x201D; column.</p></fn><fn id="table3fn2"><p><sup>b</sup>Significance vs both groups.</p></fn><fn id="table3fn3"><p><sup>c</sup><italic>P</italic>&#x003C;.05 vs the GPT-5.1 group (Bonferroni correction). 
By definition, this footnote does not appear in the &#x201C;GPT-5.1&#x201D; column.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-5"><title>Error Type Analysis and Potential Harm Assessment</title><p>Error frequencies and types across the 3 groups are shown in <xref ref-type="table" rid="table4">Tables 4</xref> and <xref ref-type="table" rid="table5">5</xref>. Physician records had the lowest mean number of errors per record (1.38, SD 1.15), whereas DeepSeek (2.56, SD 1.72) and ChatGPT (2.31, SD 1.58) had significantly more errors (<italic>P</italic>&#x003C;.001). Both LLMs showed significantly higher rates of omission errors (DeepSeek: mean 1.62, SD 1.25; ChatGPT: mean 1.45, SD 1.12) than the physician group (mean 0.68, SD 0.82; <italic>P</italic>&#x003C;.001). Inaccuracy errors also differed significantly among groups (<italic>P</italic>=.008), whereas fabrication error rates showed no significant difference (<italic>P</italic>=.28).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Analysis of error types across the 3 groups.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="middle">Error type</td><td align="left" valign="middle">GPT-5.1, mean (SD)</td><td align="left" valign="middle">DeepSeek-V3.2-Exp, mean (SD)</td><td align="left" valign="middle">Physicians, mean (SD)</td><td align="left" valign="middle"><italic>F</italic> test (<italic>df</italic>)</td><td align="left" valign="middle"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="middle">Inaccuracies</td><td align="left" valign="middle">0.62 (0.70)</td><td align="left" valign="middle">0.68 (0.82)</td><td align="left" valign="middle">0.42 (0.61)</td><td align="left" valign="middle">4.85 (2, 765)</td><td align="left" valign="middle">.008</td></tr><tr><td align="left" valign="middle">Omissions</td><td align="left" valign="middle">1.45 (1.12)<sup><xref ref-type="table-fn" 
rid="table4fn1">a</xref></sup></td><td align="left" valign="middle">1.62 (1.25)<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="middle">0.68 (0.82)</td><td align="left" valign="middle">38.92 (2, 765)</td><td align="left" valign="middle">&#x003C;.001</td></tr><tr><td align="left" valign="middle">Fabrications</td><td align="left" valign="middle">0.24 (0.46)</td><td align="left" valign="middle">0.26 (0.48)</td><td align="left" valign="middle">0.28 (0.52)</td><td align="left" valign="middle">1.28 (2, 765)</td><td align="left" valign="middle">.28</td></tr><tr><td align="left" valign="middle">Total errors</td><td align="left" valign="middle">2.31 (1.58)<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="middle">2.56 (1.72)<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td><td align="left" valign="middle">1.38 (1.15)</td><td align="left" valign="middle">28.56 (2, 765)</td><td align="left" valign="middle">&#x003C;.001</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup><italic>P</italic>&#x003C;.001 vs the physician group (Bonferroni correction; see the Methods section for definitions of error categories).</p></fn></table-wrap-foot></table-wrap><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Examples of typical errors from the 3 groups.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Source and error type</td><td align="left" valign="bottom">Error description</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">DeepSeek-V3.2-Exp</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Inaccuracy</td><td align="left" valign="top">The Poisoning Severity Score was systematically overestimated by 1 point relative to the actual severity in several 
cases.</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Omission</td><td align="left" valign="top">Omission of vital sign changes and positive findings from critical abdominal examinations.</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Fabrication</td><td align="left" valign="top">A superficial scratch on the arm of a patient with depression was attributed to a dermatological condition not present in the source record.</td></tr><tr><td align="left" valign="top" colspan="3">GPT-5.1</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Inaccuracy</td><td align="left" valign="top">The correct diagnosis was included within the differential diagnosis list rather than as the primary diagnosis; the chief concern did not conform to standard medical record writing format.</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Omission</td><td align="left" valign="top">Omission of vital sign changes and positive findings from critical abdominal examinations.</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Fabrication</td><td align="left" valign="top">For a patient who was critically ill, first-level nursing care was ordered instead of the required special-level nursing care.</td></tr><tr><td align="left" valign="top" colspan="3">Physicians</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Inaccuracy</td><td align="left" 
valign="top">Copying and pasting errors, such as transcribing &#x201C;pleural effusion&#x201D; as &#x201C;pericardial effusion&#x201D;; incorrect dosage units (mg vs g); and incorrect date entry.</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Omission</td><td align="left" valign="top">Failure to include certain minor findings from imaging studies in the diagnosis.</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Fabrication</td><td align="left" valign="top">Antihypertensive medications were not used during the actual treatment but were recorded as administered.</td></tr></tbody></table></table-wrap><p>Overall, potential harm scores (measured on a scale from 0 to 7) were low across all 3 groups (<xref ref-type="table" rid="table6">Table 6</xref>; physician group: mean 0.45, SD 0.72; DeepSeek: mean 0.82, SD 0.95; ChatGPT: mean 0.71, SD 0.88; no statistically significant difference among groups: <italic>F</italic>=0.97 and <italic>P</italic>=.38). Harm scores per individual error also showed no significant difference (<italic>P</italic>=.56). 
However, the distribution of high-harm records (&#x2265;4 points) differed significantly (DeepSeek: 8/256, 3.1%; ChatGPT: 5/256, 2%; physician group: 2/256, 0.8%; <italic>&#x03C7;</italic><sup>2</sup><sub>2</sub>=7.8; <italic>P</italic>=.02).</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Potential harm assessment results across the 3 groups<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup>.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Indicator</td><td align="left" valign="bottom">GPT-5.1</td><td align="left" valign="bottom">DeepSeek-V3.2-Exp</td><td align="left" valign="bottom">Physicians</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Overall harm score, mean (SD)</td><td align="left" valign="top">0.71 (0.88)</td><td align="left" valign="top">0.82 (0.95)</td><td align="left" valign="top">0.45 (0.72)</td><td align="left" valign="top">.38</td></tr><tr><td align="left" valign="top">Harm score per error, mean (SD)</td><td align="left" valign="top">1.35 (1.01)</td><td align="left" valign="top">1.38 (1.05)</td><td align="left" valign="top">1.32 (0.98)</td><td align="left" valign="top">.56</td></tr><tr><td align="left" valign="top">High-harm records (&#x2265;4 points; n=256), n (%)</td><td align="left" valign="top">5 (2)</td><td align="left" valign="top">8 (3.1)</td><td align="left" valign="top">2 (0.8)</td><td align="left" valign="top">.02</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>Harm scores were assessed using a modified Agency for Healthcare Research and Quality scale (0&#x2010;7 points): 0=no potential for harm; 1=potential for emotional distress or mild, transient discomfort; 2=potential for requiring additional treatment; 3=potential for temporary harm (bodily or psychological injury, likely not permanent); 4=potential for permanent harm; 5=potential for 
lifelong injury or disfigurement; 6=potential for severe permanent harm; and 7=potential for death.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Summary of Findings</title><p>This study was the first to systematically compare initial senior physician ward round records written by DeepSeek, ChatGPT, and physicians for patients with acute poisoning using real clinical progress notes and attending physician records as structured input. By grounding the evaluation in authentic clinical data rather than simulated scenarios, this design may offer greater ecological validity than studies using hypothetical vignettes, although the single-center retrospective nature limits broader generalizability.</p><p>Overall, DeepSeek achieved the highest total score, significantly outperforming both ChatGPT and the physician group. This finding demonstrates that LLMs have acquired a basic capability for assisting in medical documentation generation and that the domestic model DeepSeek performs comparably to or better than the mainstream international model ChatGPT&#x2014;providing empirical support for its application in Chinese clinical settings. With advantages including free access, absence of data export requirements, and strong alignment with Chinese medical documentation conventions, DeepSeek has broad potential in domestic medical documentation assistance [<xref ref-type="bibr" rid="ref13">13</xref>].</p></sec><sec id="s4-2"><title>Dimension-Level Performance</title><p>The LLMs&#x2014;particularly DeepSeek&#x2014;demonstrated outstanding performance in case characteristic summarization and differential diagnosis, reflecting their powerful information extraction and structured knowledge retrieval capabilities [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. 
In the differential diagnosis dimension, DeepSeek achieved near-perfect scores (mean 4.98, SD 0.10), comprehensively listing diseases requiring differentiation and providing sufficient differentiating evidence. However, in current diagnosis&#x2014;a domain requiring integrative clinical judgment&#x2014;the physician group maintained a significant advantage (mean 4.86, SD 0.30). Accurate clinical diagnosis depends not only on standardized medical knowledge but also on the synthesis of individual patient circumstances, recognition of atypical presentations, dynamic tracking of disease evolution, and the weighing of complex comorbidities. This capacity for contextual clinical reasoning, built through accumulated experience, represents a core domain in which current LLMs cannot fully substitute for trained clinicians [<xref ref-type="bibr" rid="ref19">19</xref>]. Notably, GPT-5.1 demonstrated a pattern of placing the correct diagnosis within the differential diagnosis list rather than as the primary diagnosis [<xref ref-type="bibr" rid="ref20">20</xref>]&#x2014;a finding consistent with results reported for GPT-4 by McDuff et al [<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>Both LLMs scored significantly higher than the physician group in prognosis assessment. This may reflect the models&#x2019; tendency to generate more comprehensive assessment frameworks encompassing risk factors and outcome scenarios [<xref ref-type="bibr" rid="ref22">22</xref>], or it may be attributable to clinicians simplifying prognosis documentation under time pressure in routine practice. 
Regardless, the clinical accuracy of LLM-generated prognostic content requires prospective validation against actual patient outcomes before it can be relied upon clinically.</p></sec><sec id="s4-3"><title>Subgroup Performance by Poisoning Type</title><p>Subgroup analysis revealed heterogeneity in LLM performance, with advantages concentrated in categories characterized by standardized information and well-established management protocols. In drug poisoning&#x2014;the most common category&#x2014;DeepSeek performed most prominently, consistent with the hypothesis that structured, protocol-amenable scenarios favor LLM pattern matching and knowledge extraction.</p><p>In pesticide and biological toxin poisoning cases, ChatGPT scored significantly lower than both DeepSeek and the physician group, whereas DeepSeek and physicians performed comparably. Pesticide and biological toxin poisonings (such as wasp stings) are more prevalent in China and involve complex toxicological mechanisms, diverse clinical presentations, and individualized treatment strategies. DeepSeek&#x2019;s maintained performance in these categories may reflect its deeper integration with domestic toxicology databases, local clinical guidelines, and specialized knowledge during training.</p><p>In gas poisoning (primarily carbon monoxide poisoning), no significant differences were observed among groups, possibly reflecting the highly standardized treatment principles for this category (eg, hyperbaric oxygen therapy) and the relatively small subgroup sample size. In the heterogeneous &#x201C;other poisoning&#x201D; category, the absence of significant differences suggests that current LLMs may have limitations in handling complex, atypical cases. 
These findings reinforce the recommendation that LLMs function as documentation assistants, with all generated records subject to systematic review by senior physicians before clinical use.</p></sec><sec id="s4-4"><title>Comparison With Existing Literature</title><p>The findings of this study are broadly consistent with those of Schwieger et al [<xref ref-type="bibr" rid="ref23">23</xref>], confirming that LLMs possess significant advantages in medical documentation standardization and efficiency. Unlike prior studies focused on radiology reports and discharge summaries [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref26">26</xref>], this study specifically addressed emergency department ward round records, extending the evidence on LLM applicability to critical care documentation. Consistent with Hains et al [<xref ref-type="bibr" rid="ref27">27</xref>], who identified performance gaps when processing real electronic medical record data, our study found suboptimal LLM performance in complex and atypical poisoning categories. The finding that fine-tuned models may outperform general models in specific diagnostic tasks [<xref ref-type="bibr" rid="ref18">18</xref>] further suggests that domain-specific adaptation could improve clinical performance in specialized fields such as toxicology.</p><p>Regarding safety, existing evidence indicates that the overall quality and safety of LLM-drafted discharge summaries are comparable to those of physician-written records [<xref ref-type="bibr" rid="ref9">9</xref>]; however, high-harm record rates in this study were higher for LLMs than for physicians, suggesting that ward round records&#x2014;which encode more complex clinical reasoning than discharge summaries&#x2014;may require additional scrutiny. 
This is consistent with broader evidence that LLM limitations are more pronounced in complex clinical decision-making contexts [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref30">30</xref>].</p></sec><sec id="s4-5"><title>Safety Considerations</title><p>Although overall potential harm scores were low across all 3 groups, the significantly higher proportion of high-harm records (&#x2265;4 points) in the LLM groups&#x2014;3.1% (8/256) for DeepSeek and 2% (5/256) for ChatGPT vs 0.8% (2/256) for the physician group (<italic>&#x03C7;</italic><sup>2</sup><sub>2</sub>=7.8; <italic>P</italic>=.02)&#x2014;carries important clinical safety implications. High-harm errors included systematic overestimation of the Poisoning Severity Score leading to overtreatment risk, inappropriate downgrading of nursing care levels for patients who were critically ill, and omission of critical physical examination findings. These may reflect LLM overconfidence in atypical cases, provision of definitive recommendations without acknowledging diagnostic uncertainty, or fabrication errors at critical decision points such as antidote dosing or nursing level assignment.</p><p>These findings underscore the irreplaceable role of physician review in human&#x2013;artificial intelligence collaboration models. We recommend the following safety mechanisms for clinical implementation: (1) mandatory senior physician review of all LLM-generated ward round records prior to incorporation into official documentation, (2) enhanced review protocols for critically ill patients and complex poisoning types, and (3) establishment of ongoing error monitoring and feedback systems to enable continuous optimization of model application. 
Notably, although physician-written records had the lowest proportion of high-harm errors, their occurrence (2/256, 0.8%) confirms that documentation quality control is necessary regardless of record authorship, primarily to address copy-paste errors and dosage unit confusion.</p></sec><sec id="s4-6"><title>Limitations</title><p>Several limitations warrant consideration. First, this was a single-center retrospective study; the findings may not generalize to institutions with different patient populations, clinical workflows, or regional epidemiological profiles. Second, the retrospective design precluded evaluation of actual documentation time or work efficiency improvements, which are clinically important considerations given the timeliness demands of emergency ward round documentation. Third, patient outcome data were not linked, so the clinical impact of documentation errors could not be directly assessed. Fourth, sample sizes in some subgroups were relatively small, potentially limiting statistical power. Fifth, the 2 LLMs were accessed via different methods&#x2014;API for DeepSeek and web interface for ChatGPT owing to API unavailability&#x2014;which may have introduced minor differences in output consistency. Sixth, the conservative temperature setting (0.3) for DeepSeek may have constrained output diversity; the effects of alternative parameter configurations were not explored. Seventh, the number of high-harm records was small, limiting in-depth analysis of associated risk factors. Future studies should address these limitations through multicenter prospective designs, time efficiency measurements, outcome linkage, and investigation of different model parameter settings. 
Additionally, the inference that language differences minimally impact LLM output [<xref ref-type="bibr" rid="ref31">31</xref>] was drawn from medical examination studies using an earlier ChatGPT version, and its applicability to real-world clinical notes with the current model versions should be interpreted cautiously.</p></sec><sec id="s4-7"><title>Conclusions</title><p>LLMs demonstrated satisfactory performance in generating initial senior physician ward round records in the emergency department, with DeepSeek particularly outperforming the physician group in case characteristic integration and differential diagnosis. However, physician-level performance was not achieved in core clinical diagnosis, and the significantly higher proportion of high-harm records in LLM-generated documentation underscores the necessity of rigorous physician review mechanisms in any clinical implementation. These findings position LLMs as documentation assistants rather than independent clinical authors, with the potential to augment physician efficiency without supplanting clinical judgment. Realizing this potential safely will require standardized quality control frameworks, including mandatory senior physician review, heightened oversight for complex cases and critically ill patients, and continuous monitoring of high-harm errors. Future research should prioritize multicenter prospective studies, domain-specific model fine-tuning for clinical toxicology, integration of time efficiency metrics, and the development of validated quality standards for LLM-generated medical records in emergency settings.</p></sec></sec></body><back><ack><p>The authors would like to express their sincere gratitude to the emergency medicine team for their professionalism and dedication and to the patients and their families for their trust and cooperation throughout the treatment process. 
The authors declare that generative artificial intelligence tools were not used in any portion of manuscript preparation, including writing, editing, data analysis, or figure generation. The generative artificial intelligence tools (DeepSeek-V3.2-Exp and GPT-5.1 [OpenAI]) were used solely as the study intervention, as described in the Methods section.</p></ack><notes><sec><title>Funding</title><p>This research was supported by the open research fund of Hubei Provincial Clinical Research Center for Pneumoconiosis and Poisoning (HCRCP2025B04 and HCRCP2025B06), the Graduate Innovation Project of Hubei University of Medicine (YC202542), the Graduate Education and Teaching Research Project of Hubei University of Medicine (YJ2024018), and the Guiding Project of Shiyan Science and Technology Bureau in Hubei Province (24Y044).</p></sec><sec><title>Data Availability</title><p>The data supporting the findings of this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>JZ and WP led the development of problem input models, data collection, manuscript preparation, data processing, and statistical analysis. YW, KY, and ZF were responsible for English translation and editing. 
XY conceptualized the project and supervised the design, revision, and review of all aspects of the study.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gummin</surname><given-names>DD</given-names> </name><name name-style="western"><surname>Mowry</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Beuhler</surname><given-names>MC</given-names> </name><etal/></person-group><article-title>2020 annual report of the American Association of Poison Control Centers&#x2019; National Poison Data System (NPDS): 38th annual report</article-title><source>Clin Toxicol (Phila)</source><year>2021</year><month>12</month><volume>59</volume><issue>12</issue><fpage>1282</fpage><lpage>1501</lpage><pub-id pub-id-type="doi">10.1080/15563650.2021.1989785</pub-id><pub-id pub-id-type="medline">34890263</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kind</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>MA</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Henriksen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Battles</surname><given-names>JB</given-names> </name><name name-style="western"><surname>Keyes</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Grady</surname><given-names>ML</given-names> 
</name></person-group><article-title>Documentation of mandated discharge summary components in transitions from acute to subacute care</article-title><source>Advances in Patient Safety: New Directions and Alternative Approaches</source><year>2008</year><access-date>2026-06-18</access-date><publisher-name>Agency for Healthcare Research and Quality</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.ncbi.nlm.nih.gov/books/NBK43715/">https://www.ncbi.nlm.nih.gov/books/NBK43715/</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Momenipur</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pennathur</surname><given-names>PR</given-names> </name></person-group><article-title>Balancing documentation and direct patient care activities: a study of a mature electronic health record system</article-title><source>Int J Ind Ergon</source><year>2019</year><month>07</month><volume>72</volume><fpage>338</fpage><lpage>346</lpage><pub-id pub-id-type="doi">10.1016/j.ergon.2019.06.012</pub-id><pub-id pub-id-type="medline">32201437</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Casalino</surname><given-names>LP</given-names> </name><name name-style="western"><surname>Khullar</surname><given-names>D</given-names> </name></person-group><article-title>Deep learning in medicine-promise, progress, and challenges</article-title><source>JAMA Intern Med</source><year>2019</year><month>03</month><day>1</day><volume>179</volume><issue>3</issue><fpage>293</fpage><lpage>294</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2018.7117</pub-id><pub-id 
pub-id-type="medline">30556825</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Topol</surname><given-names>EJ</given-names> </name></person-group><article-title>High-performance medicine: the convergence of human and artificial intelligence</article-title><source>Nat Med</source><year>2019</year><month>01</month><volume>25</volume><issue>1</issue><fpage>44</fpage><lpage>56</lpage><pub-id pub-id-type="doi">10.1038/s41591-018-0300-7</pub-id><pub-id pub-id-type="medline">30617339</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayers</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Poliak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dredze</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Comparing physician and artificial intelligence chatbot responses to patient questions posted to a public social media forum</article-title><source>JAMA Intern Med</source><year>2023</year><month>06</month><day>1</day><volume>183</volume><issue>6</issue><fpage>589</fpage><lpage>596</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id><pub-id pub-id-type="medline">37115527</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Garcia</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>SP</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Artificial intelligence-generated draft replies to patient inbox messages</article-title><source>JAMA Netw 
Open</source><year>2024</year><month>03</month><day>4</day><volume>7</volume><issue>3</issue><fpage>e243201</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.3201</pub-id><pub-id pub-id-type="medline">38506805</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pan</surname><given-names>W</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Clinical management of wasp stings using large language models: cross-sectional evaluation study</article-title><source>J Med Internet Res</source><year>2025</year><month>06</month><day>4</day><volume>27</volume><fpage>e67489</fpage><pub-id pub-id-type="doi">10.2196/67489</pub-id><pub-id pub-id-type="medline">40466102</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Williams</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Subramanian</surname><given-names>CR</given-names> </name><name name-style="western"><surname>Ali</surname><given-names>SS</given-names> </name><etal/></person-group><article-title>Physician- and large language model-generated hospital discharge summaries</article-title><source>JAMA Intern Med</source><year>2025</year><month>07</month><day>1</day><volume>185</volume><issue>7</issue><fpage>818</fpage><lpage>825</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2025.0821</pub-id><pub-id pub-id-type="medline">40323616</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>G&#x00FC;n</surname><given-names>M</given-names> 
</name></person-group><article-title>AI-assisted blood gas interpretation: a comparative study with an emergency physician</article-title><source>Am J Emerg Med</source><year>2025</year><month>08</month><volume>94</volume><fpage>1</fpage><lpage>2</lpage><pub-id pub-id-type="doi">10.1016/j.ajem.2025.04.028</pub-id><pub-id pub-id-type="medline">40252296</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hiebel</surname><given-names>N</given-names> </name><name name-style="western"><surname>Ferret</surname><given-names>O</given-names> </name><name name-style="western"><surname>Fort</surname><given-names>K</given-names> </name><name name-style="western"><surname>N&#x00E9;v&#x00E9;ol</surname><given-names>A</given-names> </name></person-group><article-title>Clinical text generation: are we there yet?</article-title><source>Annu Rev Biomed Data Sci</source><year>2025</year><month>08</month><volume>8</volume><issue>1</issue><fpage>173</fpage><lpage>198</lpage><pub-id pub-id-type="doi">10.1146/annurev-biodatasci-103123-095202</pub-id><pub-id pub-id-type="medline">40101215</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clough</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Sparkes</surname><given-names>WA</given-names> </name><name name-style="western"><surname>Clough</surname><given-names>OT</given-names> </name><name name-style="western"><surname>Sykes</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Steventon</surname><given-names>AT</given-names> </name><name name-style="western"><surname>King</surname><given-names>K</given-names> </name></person-group><article-title>Transforming healthcare documentation: harnessing the potential of AI to generate 
discharge summaries</article-title><source>BJGP Open</source><year>2024</year><month>04</month><volume>8</volume><issue>1</issue><fpage>BJGPO.2023.0116</fpage><pub-id pub-id-type="doi">10.3399/BJGPO.2023.0116</pub-id><pub-id pub-id-type="medline">37699649</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Qin</surname><given-names>Y</given-names> </name></person-group><article-title>Performance of DeepSeek-R1 and ChatGPT-4o on the Chinese National Medical Licensing Examination: a comparative study</article-title><source>J Med Syst</source><year>2025</year><month>06</month><day>3</day><volume>49</volume><issue>1</issue><fpage>74</fpage><pub-id pub-id-type="doi">10.1007/s10916-025-02213-z</pub-id><pub-id pub-id-type="medline">40459679</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hager</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jungmann</surname><given-names>F</given-names> </name><name name-style="western"><surname>Holland</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Evaluation and mitigation of the limitations of large language models in clinical decision-making</article-title><source>Nat Med</source><year>2024</year><month>09</month><volume>30</volume><issue>9</issue><fpage>2613</fpage><lpage>2622</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03097-1</pub-id><pub-id pub-id-type="medline">38965432</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Chen</surname><given-names>F</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Strategies for the analysis and elimination of hallucinations in artificial intelligence generated medical knowledge</article-title><source>J Evid Based Med</source><year>2025</year><month>09</month><volume>18</volume><issue>3</issue><fpage>e70075</fpage><pub-id pub-id-type="doi">10.1111/jebm.70075</pub-id><pub-id pub-id-type="medline">40983876</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Williams</surname><given-names>T</given-names> </name><name name-style="western"><surname>Szekendi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pavkovic</surname><given-names>S</given-names> </name><name name-style="western"><surname>Clevenger</surname><given-names>W</given-names> </name><name name-style="western"><surname>Cerese</surname><given-names>J</given-names> </name></person-group><article-title>The reliability of AHRQ Common Format Harm Scales in rating patient safety events</article-title><source>J Patient Saf</source><year>2015</year><month>03</month><volume>11</volume><issue>1</issue><fpage>52</fpage><lpage>59</lpage><pub-id pub-id-type="doi">10.1097/PTS.0b013e3182948ef9</pub-id><pub-id pub-id-type="medline">24080718</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Van Veen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Van Uden</surname><given-names>C</given-names> </name><name name-style="western"><surname>Blankemeier</surname><given-names>L</given-names> 
</name><etal/></person-group><article-title>Adapted large language models can outperform medical experts in clinical text summarization</article-title><source>Nat Med</source><year>2024</year><month>04</month><volume>30</volume><issue>4</issue><fpage>1134</fpage><lpage>1142</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-02855-5</pub-id><pub-id pub-id-type="medline">38413730</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>C</given-names> </name></person-group><article-title>Leveraging generative AI and large language models: a comprehensive roadmap for healthcare integration</article-title><source>Healthcare (Basel)</source><year>2023</year><month>10</month><day>20</day><volume>11</volume><issue>20</issue><fpage>2776</fpage><pub-id pub-id-type="doi">10.3390/healthcare11202776</pub-id><pub-id pub-id-type="medline">37893850</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cai</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Xiao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>X</given-names> </name></person-group><article-title>Train-time and test-time computation in large language models for error 
detection and correction in electronic medical records: a retrospective study</article-title><source>Diagnostics (Basel)</source><year>2025</year><month>07</month><day>21</day><volume>15</volume><issue>14</issue><fpage>1829</fpage><pub-id pub-id-type="doi">10.3390/diagnostics15141829</pub-id><pub-id pub-id-type="medline">40722578</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Reese</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Danis</surname><given-names>D</given-names> </name><name name-style="western"><surname>Caufield</surname><given-names>JH</given-names> </name><etal/></person-group><article-title>On the limitations of large language models in clinical diagnosis</article-title><source>medRxiv</source><year>2024</year><month>02</month><day>26</day><fpage>2023.07.13.23292613</fpage><pub-id pub-id-type="doi">10.1101/2023.07.13.23292613</pub-id><pub-id pub-id-type="medline">37503093</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McDuff</surname><given-names>D</given-names> </name><name name-style="western"><surname>Schaekermann</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Towards accurate differential diagnosis with large language models</article-title><source>Nature</source><year>2025</year><month>06</month><volume>642</volume><issue>8067</issue><fpage>451</fpage><lpage>457</lpage><pub-id pub-id-type="doi">10.1038/s41586-025-08869-4</pub-id><pub-id pub-id-type="medline">40205049</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Chung</surname><given-names>P</given-names> </name><name name-style="western"><surname>Fong</surname><given-names>CT</given-names> </name><name name-style="western"><surname>Walters</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Aghaeepour</surname><given-names>N</given-names> </name><name name-style="western"><surname>Yetisgen</surname><given-names>M</given-names> </name><name name-style="western"><surname>O&#x2019;Reilly-Shah</surname><given-names>VN</given-names> </name></person-group><article-title>Large language model capabilities in perioperative risk prediction and prognostication</article-title><source>JAMA Surg</source><year>2024</year><month>08</month><day>1</day><volume>159</volume><issue>8</issue><fpage>928</fpage><lpage>937</lpage><pub-id pub-id-type="doi">10.1001/jamasurg.2024.1621</pub-id><pub-id pub-id-type="medline">38837145</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schwieger</surname><given-names>A</given-names> </name><name name-style="western"><surname>Angst</surname><given-names>K</given-names> </name><name name-style="western"><surname>de Bardeci</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Large language models can support generation of standardized discharge summaries - a retrospective study utilizing ChatGPT-4 and electronic health records</article-title><source>Int J Med Inform</source><year>2024</year><month>12</month><volume>192</volume><fpage>105654</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2024.105654</pub-id><pub-id pub-id-type="medline">39437512</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Li</surname><given-names>F</given-names> </name><name name-style="western"><surname>Hong</surname><given-names>N</given-names> </name><etal/></person-group><article-title>A comparative study of recent large language models on generating hospital discharge summaries for lung cancer patients</article-title><source>J Biomed Inform</source><year>2025</year><month>08</month><volume>168</volume><fpage>104867</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2025.104867</pub-id><pub-id pub-id-type="medline">40544901</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Janota</surname><given-names>B</given-names> </name><name name-style="western"><surname>Janota</surname><given-names>K</given-names> </name></person-group><article-title>Application of artificial intelligence (AI) in the creation of discharge summaries in psychiatric clinics</article-title><source>Int J Psychiatry Med</source><year>2025</year><month>05</month><volume>60</volume><issue>3</issue><fpage>330</fpage><lpage>337</lpage><pub-id pub-id-type="doi">10.1177/00912174241284730</pub-id><pub-id pub-id-type="medline">39285727</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zaretsky</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Baskharoun</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Generative artificial intelligence to transform inpatient discharge summaries to patient-friendly language and format</article-title><source>JAMA Netw Open</source><year>2024</year><month>03</month><day>4</day><volume>7</volume><issue>3</issue><fpage>e240357</fpage><pub-id 
pub-id-type="doi">10.1001/jamanetworkopen.2024.0357</pub-id><pub-id pub-id-type="medline">38466307</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hains</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kleinig</surname><given-names>O</given-names> </name><name name-style="western"><surname>Murugappa</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Large language model discharge summary preparation using real-world electronic medical record data shows promise</article-title><source>Intern Med J</source><year>2025</year><month>05</month><day>28</day><volume>55</volume><issue>7</issue><fpage>1188</fpage><lpage>1192</lpage><pub-id pub-id-type="doi">10.1111/imj.70073</pub-id><pub-id pub-id-type="medline">40434141</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>TY</given-names> </name><name name-style="western"><surname>Hsieh</surname><given-names>PH</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>YC</given-names> </name></person-group><article-title>Performance comparison of junior residents and ChatGPT in the Objective Structured Clinical Examination (OSCE) for medical history taking and documentation of medical records: development and usability study</article-title><source>JMIR Med Educ</source><year>2024</year><month>11</month><day>21</day><volume>10</volume><fpage>e59902</fpage><pub-id pub-id-type="doi">10.2196/59902</pub-id><pub-id pub-id-type="medline">39622713</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Adler-Milstein</surname><given-names>J</given-names> </name><name name-style="western"><surname>Redelmeier</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Wachter</surname><given-names>RM</given-names> </name></person-group><article-title>The limits of clinician vigilance as an AI safety bulwark</article-title><source>JAMA</source><year>2024</year><month>04</month><day>9</day><volume>331</volume><issue>14</issue><fpage>1173</fpage><lpage>1174</lpage><pub-id pub-id-type="doi">10.1001/jama.2024.3620</pub-id><pub-id pub-id-type="medline">38483397</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chenais</surname><given-names>G</given-names> </name><name name-style="western"><surname>Lagarde</surname><given-names>E</given-names> </name><name name-style="western"><surname>Gil-Jardin&#x00E9;</surname><given-names>C</given-names> </name></person-group><article-title>Artificial intelligence in emergency medicine: viewpoint of current applications and foreseeable opportunities and challenges</article-title><source>J Med Internet Res</source><year>2023</year><month>05</month><day>23</day><volume>25</volume><fpage>e40031</fpage><pub-id pub-id-type="doi">10.2196/40031</pub-id><pub-id pub-id-type="medline">36972306</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Dou</surname><given-names>Z</given-names> </name><name name-style="western"><surname>He</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>L</given-names> 
</name></person-group><article-title>Performance and exploration of ChatGPT in medical examination, records and education in Chinese: pave the way for medical AI</article-title><source>Int J Med Inform</source><year>2023</year><month>09</month><volume>177</volume><fpage>105173</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105173</pub-id><pub-id pub-id-type="medline">37549499</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Clinical characteristics of the 256 included patients with acute poisoning.</p><media xlink:href="jmir_v28i1e91222_app1.docx" xlink:title="DOCX File, 15 KB"/></supplementary-material></app-group></back></article>