<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e86841</article-id><article-id pub-id-type="doi">10.2196/86841</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Error Detection in Emergency Radiology Reports Using a Large Language Model: Multistage Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Shen</surname><given-names>Hui</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wu</surname><given-names>Tianyang</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Fei</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fang</surname><given-names>Jin</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" 
rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Yuange</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wu</surname><given-names>Xiaoling</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Shuyi</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Liting</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ren</surname><given-names>Qiuping</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Meng</surname><given-names>Xiao</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Xu</surname><given-names>Jiatong</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sun</surname><given-names>Jie</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhao</surname><given-names>Yujie</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Xin</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Wang</surname><given-names>Liaoyuan</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mai</surname><given-names>Guipeng</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>You</surname><given-names>Jingjing</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jin</surname><given-names>Zhe</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wu</surname><given-names>Xuewei</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>He</surname><given-names>Wenle</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Han</surname><given-names>Xue</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Shuixing</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zeng</surname><given-names>Dong</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Zhang</surname><given-names>Bin</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Radiology, The First Affiliated Hospital 
of Jinan University</institution><addr-line>No. 613 Huangpu West Road, Tianhe</addr-line><addr-line>Guangzhou</addr-line><addr-line>Guangdong</addr-line><country>China</country></aff><aff id="aff2"><institution>School of Biomedical Engineering, Southern Medical University</institution><addr-line>Guangzhou</addr-line><addr-line>Guangdong</addr-line><country>China</country></aff><aff id="aff3"><institution>Department of Radiology, The Affiliated Hospital of Guangdong Medical University</institution><addr-line>Zhanjiang</addr-line><addr-line>Guangdong</addr-line><country>China</country></aff><aff id="aff4"><institution>Department of Radiology, Guangzhou Women and Children's Medical Center, Guangzhou Medical University</institution><addr-line>Guangzhou</addr-line><addr-line>Guangdong</addr-line><country>China</country></aff><aff id="aff5"><institution>Department of Radiology, Nanfang Hospital, Southern Medical University</institution><addr-line>Guangzhou</addr-line><addr-line>Guangdong</addr-line><country>China</country></aff><aff id="aff6"><institution>Department of Radiology, Longhu District People's Hospital of Shantou</institution><addr-line>Shantou</addr-line><addr-line>Guangdong</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Jin</surname><given-names>Qiao</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Okolie</surname><given-names>Awele</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Akinpeloye</surname><given-names>Olajide</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Jiang</surname><given-names>Shan</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Bin Zhang, MD, PhD, Department of Radiology, The First Affiliated Hospital of Jinan University, No. 
613 Huangpu West Road, Tianhe, Guangzhou, Guangdong, 510630, China, 86 15217921427; <email>xld_jane_eyre@126.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>14</day><month>4</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e86841</elocation-id><history><date date-type="received"><day>31</day><month>10</month><year>2025</year></date><date date-type="rev-recd"><day>05</day><month>02</month><year>2026</year></date><date date-type="accepted"><day>11</day><month>02</month><year>2026</year></date></history><copyright-statement>&#x00A9; Hui Shen, Tianyang Wu, Fei Wang, Jin Fang, Yuange Li, Xiaoling Wu, Shuyi Liu, Liting Chen, Qiuping Ren, Xiao Meng, Jiatong Xu, Jie Sun, Yujie Zhao, Xin Liu, Liaoyuan Wang, Guipeng Mai, Jingjing You, Zhe Jin, Xuewei Wu, Wenle He, Xue Han, Shuixing Zhang, Dong Zeng, Bin Zhang. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 14.4.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e86841"/><abstract><sec><title>Background</title><p>Emergency radiology requires highly accurate reporting under time constraints; yet, increasing workloads raise the risk of errors. While large language models (LLMs) show potential for proofreading in general radiology, their performance in emergency settings and non-English contexts remains unclear.</p></sec><sec><title>Objective</title><p>We aim to evaluate the performance of a domain-optimized LLM, DeepSeek-R1, for identifying errors in Chinese emergency radiology reports, with comparison against assessments by board-certified radiologists.</p></sec><sec sec-type="methods"><title>Methods</title><p>We compiled 7435 emergency reports (dataset 1; radiography, computed tomography, and magnetic resonance imaging) collected from November 2024 to April 2025. In stage 1, a total of 5 LLMs were evaluated using 200 reports. The best model, DeepSeek-R1, proceeded to stages 2 and 3, where 0-shot and few-shot learning were tested on a separate set (n=100). Model performance was compared against 12 radiologists. Stage 4 validated real-world utility on 800 verified reports.</p></sec><sec sec-type="results"><title>Results</title><p>In subdataset 1, under stress-testing conditions, DeepSeek-R1 achieved a higher error detection rate in the few-shot setting than in the 0-shot setting (84.4% vs 60.9%, <italic>P</italic>=.003). 
Its performance exceeded that of radiology residents (84.4% vs 51.6% and 53.1%, respectively; both <italic>P</italic>&#x003C;.05) and showed no statistically significant difference compared with senior radiologists and attending radiologists (84.4% vs 68.8%&#x2010;93.8%, <italic>P</italic>=.26 to &#x2265;.99). Compared with residents, DeepSeek-R1 detected more critical omissions (100% vs 25% and 50%; both <italic>P</italic>&#x003C;.05) and other errors (92% vs 33% and 33%; both <italic>P</italic>=.02). In dataset 2, collected from independent institutions, DeepSeek-R1 achieved a detection rate of 95% under the few-shot setting. Reading time was shorter than that of human readers (92 vs 109 s). In real-world validation, DeepSeek-R1 identified 117 true reporting errors, yielding a positive predictive value of 56.5%.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>DeepSeek-R1 holds promise for improving quality control in emergency radiology reports. Its performance and efficiency support its use as an assistive proofreading tool in real-world radiology workflows.</p></sec></abstract><kwd-group><kwd>emergency radiology</kwd><kwd>large language models</kwd><kwd>error detection</kwd><kwd>quality control</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Emergency radiology reports are critical for guiding timely and accurate patient management, particularly in trauma, acute illness, and other time-sensitive clinical scenarios. Errors in these reports, ranging from factual inaccuracies (eg, laterality confusion) to interpretive inconsistencies, can directly impact patient outcomes, leading to misdiagnoses, delayed treatments, or inappropriate management [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. 
The growing volume of emergency cases, coupled with radiologist shortages and high workloads, further increases the likelihood of mistakes [<xref ref-type="bibr" rid="ref4">4</xref>-<xref ref-type="bibr" rid="ref7">7</xref>]. Traditional proofreading methods, such as double-reading by senior radiologists, are effective but time-consuming and often impractical in fast-paced emergency settings [<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>Recent advances in large language models (LLMs) provide promising solutions for automating error detection [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. Studies have shown that LLMs such as GPT-4 (OpenAI Inc) and Claude 3.5 Sonnet (Anthropic PBC) can identify inconsistencies in radiology reports with accuracy comparable to human experts [<xref ref-type="bibr" rid="ref11">11</xref>-<xref ref-type="bibr" rid="ref13">13</xref>]. For instance, GPT-4 achieved an error detection rate of 82.7% in general radiology reports, comparable to senior radiologists (89.3%) while reducing reading time from 25.1 to 3.5 seconds per report [<xref ref-type="bibr" rid="ref12">12</xref>]. Similarly, Claude 3.5 Sonnet outperformed radiologists in detecting factual errors in head computed tomography (CT) reports, with a sensitivity of 89% compared to 33%&#x2010;69% among human readers [<xref ref-type="bibr" rid="ref13">13</xref>]. In Chinese ultrasound reports, LLMs show promise in identifying spelling and logical errors, with Claude 3.5 Sonnet achieving a 52.3% detection rate in 0-shot settings [<xref ref-type="bibr" rid="ref14">14</xref>]. Despite these advances, several critical gaps remain. First, LLM performance in emergency radiology, a domain characterized by fragmented information, urgent decision-making, and complex multimodal findings, remains unexplored. 
Second, most studies used synthetically generated errors or single-institution data, limiting generalizability to real-world emergency scenarios [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. Third, while GPT-4 and Claude show strengths in English contexts, their efficacy in non-English emergency reports (eg, Chinese) is suboptimal due to linguistic nuances and terminology variations [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. DeepSeek-R1, a state-of-the-art LLM specifically optimized for Chinese clinical text, offers a valuable opportunity to address these limitations [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. Its architecture integrates domain-specific pretraining on multilingual clinical corpora, potentially enhancing error detection in non-English emergency reports [<xref ref-type="bibr" rid="ref19">19</xref>]. However, rigorous validation using real-world emergency radiology data is still lacking.</p><p>Therefore, this study aims to evaluate DeepSeek-R1&#x2019;s ability to identify errors in real-world Chinese emergency radiology reports, benchmark its performance against both other LLMs and radiologists of varying experience levels, and assess its potential for integration into clinical workflows. With a focus on practical applicability, this work strives to establish a new benchmark for artificial intelligence (AI)&#x2013;assisted quality assurance in emergency radiology.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This retrospective study was approved by the Ethics Review Board of the First Affiliated Hospital of Jinan University (20250306). The requirement for informed consent was waived due to this study&#x2019;s retrospective nature. All patient data used during this study were strictly anonymized. 
No personally identifiable information was disclosed to any LLMs, thereby ensuring patient privacy and adherence to ethical standards. Participants received no compensation.</p></sec><sec id="s2-2"><title>Dataset and Error Categories</title><p>A total of 7435 medical records between November 2024 and April 2025 were collected from the First Affiliated Hospital of Jinan University. Dataset 1 consisted of 3 types of emergency radiology reports: CT (n=5237), magnetic resonance imaging (MRI; n=381), and radiography (n=1817). Each report included 4 sections: patient information, examination items, findings, and impression. To increase the diversity of the dataset, dataset 2 consisted of 50 error-free emergency radiology reports that passed quality control and 50 erroneous reports that did not meet quality control, collected from 2 hospitals: Nanfang Hospital, Southern Medical University, and the Affiliated Hospital of Guangdong Medical University. Both unreviewed and senior physician-reviewed versions are available for each record. All patient identifiers were anonymized before inclusion in the dataset.</p><p>First, a total of 200 error-free reports were randomly selected from dataset 1 to form subdataset 1 using a freely available research data randomization tool [<xref ref-type="bibr" rid="ref20">20</xref>], all of which were verified by human experts to ensure accuracy. These reports served as the basis for evaluating the error detection capabilities of both LLMs and physicians. Then, artificial errors were introduced into 100 of these reports through a randomized process. 
Informed by previous studies on common error types in radiology reports [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>], the error categories included (1) omission: the omission of relevant words or phrases, including both deletions and missing words (eg, &#x201C;fracture&#x201D; instead of &#x201C;no fracture&#x201D;); (2) insertion: the unintentional insertion of incorrect words or phrases, including inappropriate terms, substitutions, insertions, or word confusions (eg, &#x201C;abnormal&#x201D; instead of &#x201C;normal&#x201D;); (3) spelling: Chinese spelling errors that may arise from rapid typing using Pinyin input methods (eg, &#x201C;&#x7EB5;&#x9694;&#x201D; misspelled as &#x201C;&#x7EB5;&#x8188;,&#x201D; &#x201C;&#x690E;&#x4F53;&#x201D; misspelled as &#x201C;&#x9525;&#x4F53;&#x201D;); (4) side confusion: laterality errors (eg, &#x201C;right&#x201D; instead of &#x201C;left,&#x201D; &#x201C;lateral&#x201D; instead of &#x201C;medial&#x201D;); and (5) other errors: errors not fitting the above categories, such as incorrect dates, errors in image or series numbering, unit discrepancies (eg, centimeter vs millimeter), template-related inaccuracies, and punctuation errors. Error definitions, applications, and distribution of each error type are detailed in <xref ref-type="table" rid="table1">Table 1</xref>, and Figures S1 and S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Types of errors, their descriptions, and examples used in this study. 
The examples presented in the table are actual errors identified by DeepSeek-R1 from original emergency radiology reports of the First Affiliated Hospital of Jinan University, with the erroneous portions shown in italics.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Error type</td><td align="left" valign="bottom">Description</td><td align="left" valign="bottom" colspan="2">Example</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top">Finding</td><td align="left" valign="top">Impression</td></tr></thead><tbody><tr><td align="left" valign="top">Omission</td><td align="left" valign="top">The omission of relevant words or phrases, including both deletions and missing words</td><td align="left" valign="top">There is no evidence of hemorrhage, edema, mass effect, or infarction. &#x201C;<italic>The visualized bilateral maxillary sinuses show mucosal thickening with patchy fluid density shadows within.</italic>&#x201D;</td><td align="left" valign="top"><italic>Normal study</italic></td></tr><tr><td align="left" valign="top">Insertion</td><td align="left" valign="top">The unintentional insertion of incorrect words or phrases, including inappropriate terms, substitutions, insertions, or word confusions</td><td align="left" valign="top">There is a fracture at the inferior pole of the patella with displacement of the larger fragment inferiorly and anteriorly, and displacement of a smaller bony fragment inferiorly. Alignment within the right knee joint is appropriate. No further pathologic step-offs or interruptions of the cortex are delineated. 
The bone trabeculae are homogeneously structured.</td><td align="left" valign="top"><italic>Nondisplaced</italic> fracture at the inferior pole of the patella.</td></tr><tr><td align="left" valign="top">Spelling</td><td align="left" valign="top">Chinese spelling errors that may arise from rapid typing using Pinyin input methods</td><td align="left" valign="top">Disruption of cortical continuity is noted in the distal phalanx of the left <italic>hallux</italic> (<italic>&#x62C7;&#x8DBE;</italic>)<italic>,</italic> with an identifiable transverse or radiolucent fracture line.</td><td align="left" valign="top">Mildly displaced fracture of the distal phalanx of the left <italic>hallux</italic> (<italic>&#x62C7;&#x6307;</italic>), <italic>which is a spelling error in Chinese.</italic></td></tr><tr><td align="left" valign="top">Side confusion</td><td align="left" valign="top">Laterality errors</td><td align="left" valign="top">Adjacent to the middle phalanx of the <italic>left</italic> fifth finger, there is a small, well-defined osseous density fragment.</td><td align="left" valign="top">Small osseous density adjacent to the <italic>right</italic> fifth middle phalanx, suspicious for an avulsion fracture.</td></tr><tr><td align="left" valign="top">Other errors</td><td align="left" valign="top">Errors not fitting the above categories, such as incorrect dates, errors in image or series numbering, unit discrepancies, template-related inaccuracies, and punctuation errors</td><td align="left" valign="top">The liver demonstrates smooth contours and normal proportions. Multiple round low-attenuation lesions are present within the hepatic parenchyma, with well-defined margins. 
The largest lesion measures approximately <italic>1.2 cm</italic> in diameter.</td><td align="left" valign="top">Multiple low-attenuation hepatic lesions, the largest measuring approximately <italic>1.2 mm</italic>.</td></tr></tbody></table></table-wrap><p>Clinical experts methodically implanted 127 errors into the modified reports, with no more than 2 errors per report. Therefore, a comparative dataset of 100 error-free and 100 erroneous reports was established to evaluate the diagnostic discrepancy detection capabilities of both LLMs and radiologists. Only the errors intentionally inserted into the text were used as the reference standard. To ensure the absence of any additional errors in the radiology reports, 3 readers (HS, XW, and BZ, with 5, 5, and 8 years of experience, respectively) independently reviewed the reports. Discrepancies were resolved via consensus review among 3 readers.</p><p>Two types of prompts were developed for the LLMs: 0-shot prompts and few-shot prompts. The few-shot prompt was constructed by incorporating 6 additional emergency radiology report samples into the 0-shot prompt framework, while maintaining consistency across all other prompt components. These 6 example reports were randomly selected from the pool of emergency radiology reports, with deliberate attention to ensuring diagnostic diversity and coverage of common error types, including item omission, insertion, spelling errors, laterality confusion, and other frequently encountered reporting mistakes. This stratified yet random selection strategy was intended to reflect the variability of real-world radiology reporting without biasing the model toward any specific diagnosis or error category. The anonymized example reports used for few-shot prompting are provided in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
Each prompt was structured into three main sections: (1) role context and task description, (2) definitions of report error types, and (3) output constraints (<xref ref-type="table" rid="table2">Table 2</xref>). In the 0-shot setting, the prompt was specifically designed with an emphasis on Chinese medical terminology to ensure alignment with the linguistic and domain-specific characteristics of emergency radiology reports in clinical practice. While this domain-specific tailoring was intended to improve task relevance, we acknowledge that it may introduce a performance advantage for DeepSeek-R1, which has been trained on corpora with similar linguistic and clinical characteristics.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Detailed prompts and parameters of large language models used in this study.</p></caption><table id="table2" frame="hsides" rules="groups"><tbody><tr><td align="left" valign="top">Prompt name</td><td align="left" valign="top">Prompt text</td></tr><tr><td align="left" valign="top">0-shot prompt</td><td align="left" valign="top">You are a radiology physician. In the subsequent text, I will provide you with an emergency radiology report containing the following three sections: &#x201C;Examination details&#x201D; &#x201C;Findings&#x201D; and &#x201C;Impression&#x201D;, please assess the report for potential inaccuracies. If you identify any of the following error types, please address the corresponding issues.<break/>Omission: Discrepancies between imaging findings and imaging diagnosis that may lead to ambiguity, missed diagnosis, or misdiagnosis. 
For example, a positive lesion mentioned in imaging findings is not referred in the imaging diagnosis.<break/>Insertion: Unintentional inclusion of incorrect terms or phrases within imaging findings or radiological diagnosis, such as improper word substitutions or insertions.<break/>Spelling errors: Spelling mistakes caused by rapid typing of chinese characters, such as substitutions of homophones that may alter the intended meaning.<break/>Positional discrepancies: Inconsistencies between imaging descriptions and diagnosis regarding anatomical locations, such as confusion between right and left or medial and lateral positions.<break/>Other errors: Errors not falling into the above categories, including inaccuracies in report comparison dates, measurement units, punctuation, etc<break/>Please review each report provided below one by one to check for errors. Output your findings in the following format: &#x201C;Serial number XXX /n Error categories: &#x2212;1-, &#x2212;2-, &#x2026; /n Cause of error: &#x2026;&#x201D;</td></tr><tr><td align="left" valign="top">Few-shot prompt</td><td align="left" valign="top">I am going to provide you with six example reports: one error-free report and five reports containing the various categories of errors listed. The sole purpose of these example reports is to improve your comprehension and to help you recognize the various error categories mentioned in the subsequent tasks.<break/>We then entered each report and its corresponding error description and categorization in order. For example, one example report contained &#x201C;Side confusion,&#x201D; specifically, the radiology description showed a small, well-defined osseous density fragment in the left fifth finger, whereas the radiology impression described small osseous density adjacent to the right fifth middle phalanx, suspicious for an avulsion fracture. 
At the end of the learning phase containing all 6 example reports, we entered &#x201C;provided example reports completed&#x201D; to indicate the end of the examples. We then followed the prompts mentioned in the 0-shot setting to begin error detection on the test set reports.</td></tr></tbody></table></table-wrap></sec><sec id="s2-3"><title>Study Design</title><p>This study evaluated the capabilities of different LLMs and radiologists in detecting errors in emergency radiology reports across diverse scenarios through a 4-stage experimental design (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Workflow of the multistage validation study. Schematic diagram of the 4-stage experimental design. (<bold>A</bold>) Initial screening of 5 large language models using 200 reports (100 error-free; 100 with 127 synthetic errors across 5 categories). (<bold>B</bold>) Few-shot optimization of top-performing models (DeepSeek-R1) using clinical examples. (<bold>C</bold>) Benchmarking against varying experience of radiologists (seniors, attendings, and residents) in the 0 and few-shot settings on subdataset 1 and subdataset 2. (<bold>D</bold>) Real-world validation on 800 verified emergency reports. CT: computed tomography; DR: digital radiography; LLM: large language model; MRI: magnetic resonance imaging.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86841_fig01.png"/></fig></sec><sec id="s2-4"><title>Stage 1: Initial Model Screening</title><p>First, 200 error-free reports were randomly selected from dataset 1 using a freeware data randomization tool [<xref ref-type="bibr" rid="ref20">20</xref>], and then proofread by human experts to ensure accuracy. 
We evaluated 5 LLMs (open-source: DeepSeek-R1, DeepSeek-V3, and Qwen3-235B; closed-source: GPT-4o and Grok3) using 0-shot prompts to rank their performance in detecting errors in emergency radiology reports (200 cases), with the aim of determining the 2 best-performing models for subsequent analysis.</p></sec><sec id="s2-5"><title>Stage 2: Few-Shot Prompt Evaluation</title><p>The 2 top-performing LLMs from stage 1 were re-evaluated in 0-shot and few-shot settings using a test set (100 reports and 64 errors), which was composed by randomly selecting 50 correct reports and 50 artificial-error subgroup reports. This phase explored whether the error detection capabilities of the optimal model improve in a few-shot setting and determined the best-performing model for stage 3 and stage 4 analysis.</p></sec><sec id="s2-6"><title>Stage 3: Exploratory Analysis of DeepSeek-R1 Performance</title><p>Using the top-performing model (DeepSeek-R1) from stage 2 and using the same test set as stage 2 (100 reports and 64 errors), the error detection performance and time taken by 12 radiologists of different experience levels (4 senior radiologists, 4 attending radiologists, and 4 residents) were assessed via a customized online survey platform [<xref ref-type="bibr" rid="ref24">24</xref>]. Experts were stratified into 2 groups (6 experts per group) and each group independently evaluated the 100 reports, benchmarking against the LLM&#x2019;s performance under simulated 0-shot and few-shot workflows. In addition, under the few-shot workflow, further testing was conducted on dataset 2 (100 reports containing 60 errors) to compare the error detection performance between DeepSeek-R1 and 3 radiologists with different experience levels (1 senior radiologist, 1 attending radiologist, and 1 resident). 
Stages 2 and 3 were intentionally designed as stress-testing phases for boundary performance, in which error prevalence was artificially enriched to enable controlled and statistically efficient comparison of error detection sensitivity across models and human readers.</p></sec><sec id="s2-7"><title>Stage 4: Validation on a Real-World Dataset</title><p>Using the few-shot model from stage 2, DeepSeek-R1 processed 800 unverified real emergency radiology reports (400 CT, 200 MR, and 200 radiograph reports), with outputs compared to final reviewed versions. Reports were then assessed by 2 senior radiologists to determine if LLM outputs could influence final diagnoses and sorted as either true errors or false-positive results. These assessments quantified how AI assistance might impact false-positive generation. The clinical impact of false-positive responses was further assessed to determine their actual harmfulness or potential benefits, with the detailed criteria outlined in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-8"><title>Statistical Analysis</title><p>All emergency report evaluation results were collected and analyzed using R (version 4.2.3; R Foundation) software. We measured the number of correctly identified errors, mean processing time, and associated costs. The average detection time and associated costs per report were calculated from evaluations of 20 randomly selected emergency radiology reports. Only artificially introduced errors during data processing were included in the measurements. To ensure the reliability of our findings, all reports in the dataset and their corresponding outputs were reviewed by 3 reviewers (HS, TW, and BZ). 
Detailed methods for error quantification and analytical procedures are provided in the text and Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>We compared the number of correctly identified errors and mean processing time between LLMs and radiologists using metrics such as positive predictive value (PPV), true positive rate (TPR), false positive report rate (FPRR), and <italic>F</italic><sub>1</sub>-score. We computed CIs using the Wilson score method [<xref ref-type="bibr" rid="ref25">25</xref>]. To compare performance metrics between radiologists and LLMs across 0-shot and few-shot settings, Wald chi-square tests were used to evaluate differences in error detection metrics (PPV, TPR, and <italic>F</italic><sub>1</sub>-scores) and FPRR. Bonferroni correction adjusted for multiple comparisons, with a 2-tailed <italic>P</italic> value &#x003C;.05 considered statistically significant. Cohen &#x03BA; assessed agreement between model predictions and individual readers, while the intraclass correlation coefficient measured interrater reliability among radiologists. Interreader reliability was assessed using Cohen &#x03BA;, with the following classification: 0.01&#x2010;0.20 (negligible to slight), 0.21&#x2010;0.40 (fair), 0.41&#x2010;0.60 (moderate), 0.61&#x2010;0.80 (substantial), and 0.81&#x2010;1.00 (nearly perfect) [<xref ref-type="bibr" rid="ref26">26</xref>].</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Stage 1: Performance in Detecting Errors Among LLMs</title><p>In stage 1 (200 reports and 127 errors), DeepSeek-R1 achieved the optimal performance, with a detection rate of 51.2% (65/127). For error detection, the PPV, TPR, and <italic>F</italic><sub>1</sub>-score were 64.4% (95% CI 55%&#x2010;74%), 51.2% (95% CI 42.3%&#x2010;60%), and 57% (95% CI 49%&#x2010;64.5%), respectively. Grok3 was the second-best performing model, with a detection rate of 48% (61/127). 
For error detection, the PPV, TPR, and <italic>F</italic><sub>1</sub>-score were 64.9% (95% CI 54.9%&#x2010;74.7%), 48% (95% CI 39.5%&#x2010;56.9%), and 55.2% (95% CI 47.1%&#x2010;62.8%), respectively. In contrast, DeepSeek-V3 yielded a detection rate of 37% (47/127), with PPV and TPR of 54% (95% CI 43.4%&#x2010;64.4%) and 37% (95% CI 28.8%&#x2010;45.7%), respectively. Additionally, Qwen3 and GPT-4o each achieved a detection rate of only 33.1% (42/127), with TPRs of 33.1% (95% CI 25.2%&#x2010;41.4%) and 33.1% (95% CI 24.8%&#x2010;41.2%), PPVs of 48.3% (95% CI 37.3%&#x2010;58.8%) and 50% (95% CI 39.2%&#x2010;60.8%), and <italic>F</italic><sub>1</sub>-scores of 39.3% (95% CI 30.7%&#x2010;47.2%) and 39.8% (95% CI 31.1%&#x2010;47.8%), respectively (<xref ref-type="table" rid="table3">Table 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Performance of different LLMs<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> in error detection in a 0-shot setting. Bonferroni correction was used to correct <italic>P</italic> values for multiple comparisons with DeepSeek-R1. 
Higher values of detection rate, PPV<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>, and TPR<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup> indicate better detection performance of the model, while a higher FPRR<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup> value suggests poorer detection performance.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom" colspan="2">Detection rate (%)</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">PPV (%), 95% CI</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">TPR (%), 95% CI</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (%), 95% CI</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">FPRR (%), 95% CI</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Detection rate (%), 95% CI</td><td align="left" valign="top">Values, n/N</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr></thead><tbody><tr><td align="left" valign="top">DeepSeek-R1</td><td align="left" valign="top">51.2 (40.9-62.2)</td><td align="left" valign="top">65/127</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="top">64.4 (55.0-74.0)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">51.2 (42.3-60.0)</td><td align="left" valign="top">&#x2014;</td><td align="left" 
valign="top">57.0 (49.0-64.5)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="top">16.5 (11.5-22.0)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Grok3</td><td align="left" valign="top">48.0 (37.8-58.3)</td><td align="left" valign="top">61/127</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">64.9 (54.9-74.7)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">48.0 (39.5-56.9)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">55.2 (47.1-62.8)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">16.5 (11.5-22.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top">DeepSeek-V3</td><td align="left" valign="top">37.0 (27.6-47.2)</td><td align="left" valign="top">47/127</td><td align="left" valign="top">.09</td><td align="left" valign="top">54.0 (43.4-64.4)</td><td align="left" valign="top">.35</td><td align="left" valign="top">37.0 (28.8-45.7)</td><td align="left" valign="top">.09</td><td align="left" valign="top">43.9 (35.3-52.2)</td><td align="left" valign="top">.19</td><td align="left" valign="top">20.0 (14.5-26.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top">Qwen3</td><td align="left" valign="top">33.1 (24.4-42.5)</td><td align="left" valign="top">42/127</td><td align="left" valign="top">.01<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="top">48.3 (37.3-58.8)</td><td align="left" valign="top">.052</td><td align="left" valign="top">33.1 (25.2-41.4)</td><td align="left" valign="top">.01<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="top">39.3 (30.7-47.2)</td><td align="left" valign="top">.03<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td 
align="left" valign="top">22.5 (17.0-28.5)</td><td align="left" valign="top">.52</td></tr><tr><td align="left" valign="top">GPT-4o</td><td align="left" valign="top">33.1 (24.4-42.5)</td><td align="left" valign="top">42/127</td><td align="left" valign="top">.01<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="top">50.0 (39.2-60.8)</td><td align="left" valign="top">.10</td><td align="left" valign="top">33.1 (24.8-41.2)</td><td align="left" valign="top">.01<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="top">39.8 (31.1-47.8)</td><td align="left" valign="top">.03<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup></td><td align="left" valign="top">21.0 (15.5-27.0)</td><td align="left" valign="top">&#x2265;.99</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>LLMs: large language models.</p></fn><fn id="table3fn2"><p><sup>b</sup>PPV: positive predictive value.</p></fn><fn id="table3fn3"><p><sup>c</sup>TPR: true positive rate.</p></fn><fn id="table3fn4"><p><sup>d</sup>FPRR: false positive report rate.</p></fn><fn id="table3fn5"><p><sup>e</sup>Not available.</p></fn><fn id="table3fn6"><p><sup>f</sup><italic>P</italic> &#x003C; 0.05 was considered statistically significant.</p></fn></table-wrap-foot></table-wrap><p>Regarding the negative impact, the false positives generated by DeepSeek-R1 and Grok3 were comparable. For example, the FPRR for both DeepSeek-R1 and Grok3 was 16.5% (95% CI 11.5%&#x2010;22%). 
In contrast, DeepSeek-V3, Qwen3, and GPT-4o generated numerically more false positives, with an FPRR of 20% (95% CI 14.5%&#x2010;26%), 22.5% (95% CI 17%&#x2010;28.5%), and 21% (95% CI 15.5%&#x2010;27%), respectively, although these differences were not statistically significant (<italic>P</italic>=.52 to &#x2265;.99; <xref ref-type="table" rid="table3">Table 3</xref>).</p></sec><sec id="s3-2"><title>Stages 2 and 3: Exploratory Analysis of DeepSeek-R1 Performance: Overall Performance in Detecting Errors on Subdataset 1 (100 Reports With 64 Errors)</title><p>In stage 2, we compared the performance of DeepSeek-R1 and Grok3 under both 0-shot and few-shot settings against radiologists with varying experience levels on dataset 1 (100 reports with 64 errors). Both models showed a higher error detection rate in the few-shot setting compared to the 0-shot setting. It should be noted that these detection rates reflect sensitivity under controlled, error-enriched conditions and are not intended to represent expected detection performance in routine clinical practice. Specifically, DeepSeek-R1&#x2019;s detection rate increased significantly from 60.9% (39/64) to 84.4% (54/64; <italic>P</italic>=.003), whereas Grok3&#x2019;s improvement was not statistically significant, rising from 53.1% (34/64) to 56.3% (36/64; <italic>P</italic>=.72). A significant performance difference was observed between DeepSeek-R1 and Grok3 in the few-shot setting (<italic>P</italic>&#x003C;.001), but not in the 0-shot setting (<italic>P</italic>=.38).</p><p>In the 0-shot error detection task, DeepSeek-R1&#x2019;s detection rate (60.9%) was significantly higher than that of resident 2 (60.9% vs 35.9%, <italic>P</italic>&#x003C;.05). However, there was no evidence of a difference in the percentage of detected errors between DeepSeek-R1 and the other radiologists per report (<italic>P</italic> value range, .15 to &#x2265;.99). 
Under the few-shot learning setting, DeepSeek-R1 achieved a detection rate of 84.4% (54/64), surpassing the few-shot performance of resident radiologists. However, there was no evidence of a difference in the percentage of detected errors between DeepSeek-R1 and the other radiologists per report (<italic>P</italic> value range, .26 to &#x2265;.99). Detailed performance metrics for LLMs and radiologists are provided in <xref ref-type="table" rid="table4">Table 4</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Comparison of error detection between LLMs<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> and the radiologists in 0-shot and few-shot settings. Bonferroni correction was used to correct <italic>P</italic> values for multiple comparisons.<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom" colspan="2">Detection rate (%)</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">PPV<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup> (%), 95% CI</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">TPR<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup> (%), 95% CI</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score (%), 95% CI</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">FPRR<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup> (%), 95% CI</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Detection rate (%), 95% CI</td><td align="left" valign="top">Values, n/N</td><td align="left" valign="top"/><td align="left" valign="top"/><td 
align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr></thead><tbody><tr><td align="left" valign="top">0-shot</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>DeepSeek-R1</td><td align="left" valign="top">60.9 (47.9-72.6)</td><td align="left" valign="top">39/64</td><td align="left" valign="top">Ref.<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">90.7 (81.3-97.9)</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">60.9 (49.2-72.6)</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">72.9 (63.0-81.7)</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">4.0 (1.0-8.0)</td><td align="left" valign="top">Ref.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Grok3</td><td align="left" valign="top">53.1 (40.3-65.6)</td><td align="left" valign="top">34/64</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">79.1 (63.5-89.4)</td><td align="left" valign="top">.89</td><td align="left" valign="top">53.1 (40.7-65.0)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">63.6 (52.0-73.4)</td><td align="left" valign="top">.90</td><td align="left" valign="top">9.0 (4.0-15.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior 1</td><td align="left" valign="top">73.4 (60.7-83.4)</td><td align="left" valign="top">47/64</td><td align="left" valign="top">.92</td><td align="left" valign="top">90.4 (81.5-97.8)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">73.4 (60.7-83.4)</td><td align="left" valign="top">.90</td><td align="left" valign="top">81.0 (72.2-88.3)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">5.0 (1.0-10.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior 2</td><td align="left" valign="top">71.9 (59.1-82.1)</td><td align="left" valign="top">46/64</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">93.9 (86.3-100.0)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">71.9 (60.3-82.8)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">81.4 (72.7-88.9)</td><td align="left" valign="top">.87</td><td align="left" valign="top">3.0 (0.0-7.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending 1</td><td align="left" valign="top">68.8 (55.8-79.2)</td><td align="left" valign="top">44/64</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">86.3 (75.9-94.8)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">68.8 (56.9-79.7)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">76.5 (67.2-84.4)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">7.0 (3.0-13.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending 2</td><td align="left" valign="top">65.6 (52.6-76.8)</td><td align="left" valign="top">42/64</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">84.0 (73.5-93.6)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">65.6 (53.5-76.8)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">73.7 (63.5-82.1)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">8.0 (3.0-13.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Resident 1</td><td align="left" valign="top">40.6 (28.8-53.6)</td><td align="left" valign="top">26/64</td><td align="left" valign="top">.15</td><td align="left" valign="top">92.9 (81.8-100.0)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">40.6 (29.0-53.1)</td><td align="left" valign="top">.13</td><td align="left" valign="top">56.5 (43.8-68.0)</td><td align="left" valign="top">.06</td><td align="left" valign="top">2.0 (0.0-5.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Resident 2</td><td align="left" valign="top">35.9 (24.6-49.0)</td><td align="left" valign="top">23/64</td><td align="left" valign="top">.03<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" valign="top">85.2 (70.4-96.8)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">35.9 (24.6-48.3)</td><td align="left" valign="top">.021<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" valign="top">50.5 (37.2-62.7)</td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" 
valign="top">4.0 (1.0-8.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top">Few-shot</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>DeepSeek-R1</td><td align="left" valign="top">84.4 (72.7-91.9)</td><td align="left" valign="top">54/64</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">91.5 (83.6-98.2)</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">84.4 (75.0-92.6)</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">87.8 (81.1-93.3)</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">5.0 (1.0-10.0)</td><td align="left" valign="top">Ref.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Grok3</td><td align="left" valign="top">56.3 (43.3-68.4)</td><td align="left" valign="top">36/64</td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" valign="top">82.9 (70.3-93.9)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">56.3 (42.2-66.7)</td><td align="left" valign="top">.001</td><td align="left" valign="top">66.0 (54.2-75.9)</td><td align="left" valign="top">&#x003C;.001<bold><sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></bold></td><td align="left" valign="top">7.0 (2.0-12.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior 3</td><td align="left" valign="top">93.8 (84.0-98.0)</td><td align="left" valign="top">60/64</td><td align="left" valign="top">.63</td><td align="left" valign="top">96.8 (91.5-100.0)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">93.8 (87.3-98.6)</td><td align="left" valign="top">.60</td><td align="left" valign="top">95.2 (91.1-98.5)</td><td align="left" valign="top">.30</td><td align="left" valign="top">2.0 (0.0-5.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior 4</td><td align="left" valign="top">84.4 (72.7-91.9)</td><td align="left" valign="top">54/64</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">93.1 (85.7-98.5)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">84.4 (75.4-93.0)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">88.5 (82.1-94.1)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">4.0 (1.0-8.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending 3</td><td align="left" valign="top">70.3 (57.4-80.8)</td><td align="left" valign="top">45/64</td><td align="left" valign="top">.40</td><td align="left" valign="top">91.8 (83.3-98.1)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">70.3 (58.6-81.3)</td><td align="left" valign="top">.38</td><td align="left" valign="top">79.6 (70.7-87.2)</td><td align="left" valign="top">.65</td><td align="left" valign="top">4.0 (1.0-8.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending 4</td><td align="left" valign="top">68.8 (55.8-79.4)</td><td align="left" valign="top">44/64</td><td align="left" valign="top">.26</td><td align="left" valign="top">91.7 (83.0-98.1)</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">68.8 (57.6-79.7)</td><td align="left" valign="top">.27</td><td align="left" valign="top">78.6 (69.8-86.4)</td><td align="left" valign="top">.42</td><td align="left" valign="top">4.0 (1.0-8.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Resident 3</td><td align="left" valign="top">53.1 (40.3-65.6)</td><td align="left" valign="top">34/64</td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" valign="top">89.5 (78.4-97.6)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">53.1 (40.9-65.5)</td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" valign="top">66.7 (55.2-76.5)</td><td align="left" valign="top">.001<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" valign="top">4.0 (1.0-8.0)</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Resident 4</td><td align="left" valign="top">51.6 (38.8-64.1)</td><td align="left" valign="top">33/64</td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" valign="top">86.8 (75.0-97.1)</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">51.6 (39.7-63.8)</td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" 
rid="table4fn7">g</xref></sup></td><td align="left" valign="top">64.7 (53.3-74.8)</td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" valign="top">5.0 (1.0-10.0)</td><td align="left" valign="top">&#x2265;.99</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table4fn2"><p><sup>b</sup>The performance of DeepSeek-R1 in detecting errors was compared with that of Grok3 and radiologists using Wald chi-square tests. Higher values of detection rate, positive predictive value, and true positive rate indicate better detection performance of the model, while a higher false positive report rate value suggests poorer detection performance.</p></fn><fn id="table4fn3"><p><sup>c</sup>PPV: positive predictive value.</p></fn><fn id="table4fn4"><p><sup>d</sup>TPR: true positive rate.</p></fn><fn id="table4fn5"><p><sup>e</sup>FPRR: false positive report rate.</p></fn><fn id="table4fn6"><p><sup>f</sup>Ref.: Reference.</p></fn><fn id="table4fn7"><p><sup>g</sup><italic>P</italic> &#x003C; 0.05 was considered statistically significant.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Performance in Detecting Errors by Error Type on Subdataset 1 (100 Reports With 64 Errors)</title><p>In the 0-shot setting, DeepSeek-R1 showed superior performance over 2 residents in detecting side confusion, with a detection rate of 94% (95% CI 71&#x2010;100) compared to 39% (95% CI 18&#x2010;64) and 28% (95% CI 11&#x2010;54; <italic>P</italic>=.003 and &#x003C;.001, respectively; <xref ref-type="fig" rid="figure2">Figure 2A</xref> and <xref ref-type="table" rid="table5">Table 5</xref>). In addition, DeepSeek-R1 detected other types of errors more frequently than resident 1 (67%, 95% CI 35&#x2010;89 vs 8%, 95% CI 1&#x2010;40; <italic>P</italic>=.022). 
However, no significant differences were observed between DeepSeek-R1 and the other radiologists in the overall error detection rate per report (<italic>P</italic> values range from .091 to &#x003E;.99).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Performance comparison for critical error types. Bar graphs comparing proofreading performance between human readers and large language models across error types. (<bold>A</bold>) Detection rates in a 0-shot setting. (<bold>B</bold>) Detection rates in a few-shot setting.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86841_fig02.png"/></fig><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Comparison of detection rates for different error types in radiology reports in 0-shot and few-shot settings. Other errors included errors that did not fit into the defined categories, such as incorrect date entries, errors in numbering of images and/or series, and mistakes in specifying units of measurement (eg, centimeter vs millimeter). 
Bonferroni correction was used to correct <italic>P</italic> values for multiple comparisons.<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup></p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="2">Omission</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom" colspan="2">Insertion</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom" colspan="2">Spelling</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom" colspan="2">Side confusion</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom" colspan="2">Other</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Detection rate (%), 95% CI</td><td align="left" valign="top">Values, n/N</td><td align="left" valign="top"/><td align="left" valign="top">Detection rate (%), 95% CI</td><td align="left" valign="top">Values, n/N</td><td align="left" valign="top"/><td align="left" valign="top">Detection rate (%), 95% CI</td><td align="left" valign="top">Values, n/N</td><td align="left" valign="top"/><td align="left" valign="top">Detection rate (%), 95% CI</td><td align="left" valign="top">Values, n/N</td><td align="left" valign="top"/><td align="left" valign="top">Detection rate (%), 95% CI</td><td align="left" valign="top">Values, n/N</td><td align="left" valign="top"/></tr></thead><tbody><tr><td align="left" valign="top" colspan="16">0-shot</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>DeepSeek-R1</td><td align="left" valign="top">25 (7-57)</td><td align="char" char="." 
valign="top">3/12</td><td align="left" valign="top">Ref.<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup></td><td align="left" valign="top">25 (45-64)</td><td align="char" char="." valign="top">2/8</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">64 (36-86)</td><td align="char" char="." valign="top">9/14</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">94 (71-100)</td><td align="char" char="." valign="top">17/18</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">67 (35-89)</td><td align="char" char="." valign="top">8/12</td><td align="left" valign="top">Ref.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Grok3</td><td align="left" valign="top">50 (22-77)</td><td align="char" char="." valign="top">6/12</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">38 (10-74)</td><td align="char" char="." valign="top">3/8</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">43 (19-70)</td><td align="char" char="." valign="top">6/14</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">61 (36-82)</td><td align="char" char="." valign="top">11/18</td><td align="left" valign="top">.11</td><td align="left" valign="top">67 (35-89)</td><td align="char" char="." valign="top">8/12</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior 1</td><td align="left" valign="top">75 (43-93)</td><td align="char" char="." valign="top">9/12</td><td align="left" valign="top">.10</td><td align="left" valign="top">75 (36-96)</td><td align="char" char="." valign="top">6/8</td><td align="left" valign="top">.32</td><td align="left" valign="top">64 (36-86)</td><td align="char" char="." 
valign="top">9/14</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">83 (58-96)</td><td align="char" char="." valign="top">15/18</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">67 (35-89)</td><td align="char" char="." valign="top">8/12</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior 2</td><td align="left" valign="top">67 (35-89)</td><td align="char" char="." valign="top">8/12</td><td align="left" valign="top">.28</td><td align="left" valign="top">88 (47-99)</td><td align="char" char="." valign="top">7/8</td><td align="left" valign="top">.08</td><td align="left" valign="top">57 (30-81)</td><td align="char" char="." valign="top">8/14</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">78 (52-93)</td><td align="char" char="." valign="top">14/18</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">75 (43-93)</td><td align="char" char="." valign="top">9/12</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending 1</td><td align="left" valign="top">75 (43-93)</td><td align="char" char="." valign="top">9/12</td><td align="left" valign="top">.10</td><td align="left" valign="top">63 (26-90)</td><td align="char" char="." valign="top">5/8</td><td align="left" valign="top">.91</td><td align="left" valign="top">57 (30-81)</td><td align="char" char="." valign="top">8/14</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">61 (36-82)</td><td align="char" char="." valign="top">11/18</td><td align="left" valign="top">.11</td><td align="left" valign="top">67 (35-89)</td><td align="char" char="." 
valign="top">8/12</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending 2</td><td align="left" valign="top">67 (35-89)</td><td align="char" char="." valign="top">8/12</td><td align="left" valign="top">.28</td><td align="left" valign="top">50 (17-83)</td><td align="char" char="." valign="top">4/8</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">71 (42-90)</td><td align="char" char="." valign="top">10/14</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">78 (52-93)</td><td align="char" char="." valign="top">14/18</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">50 (22-78)</td><td align="char" char="." valign="top">6/12</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Resident 1</td><td align="left" valign="top">42 (19-68)</td><td align="char" char="." valign="top">5/12</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">50 (17, 83)</td><td align="char" char="." valign="top">4/8</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">64 (36-86)</td><td align="char" char="." valign="top">9/14</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">39 (18-64)</td><td align="char" char="." valign="top">7/18</td><td align="left" valign="top">.003<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">8 (1-40)</td><td align="char" char="." 
valign="top">1/12</td><td align="left" valign="top">.02<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Resident 2</td><td align="left" valign="top">42 (19-68)</td><td align="char" char="." valign="top">5/12</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">50 (17-83)</td><td align="char" char="." valign="top">4/8</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">50 (24-76)</td><td align="char" char="." valign="top">7/14</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">28 (11-54)</td><td align="char" char="." valign="top">5/18</td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">17 (3-50)</td><td align="char" char="." valign="top">2/12</td><td align="left" valign="top">.09</td></tr><tr><td align="left" valign="top" colspan="16">Few-shot</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>DeepSeek-R1</td><td align="left" valign="top">100 (70-100)</td><td align="char" char="." valign="top">12/12</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">50 (22-77)</td><td align="char" char="." valign="top">4/8</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">79 (49-94)</td><td align="char" char="." valign="top">11/14</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">89 (64-98)</td><td align="char" char="." valign="top">16/18</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">92 (60-100)</td><td align="char" char="." 
valign="top">11/12</td><td align="left" valign="top">Ref.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Grok3</td><td align="left" valign="top">75 (43-93)</td><td align="char" char="." valign="top">9/12</td><td align="left" valign="top">.45</td><td align="left" valign="top">50 (22-77)</td><td align="char" char="." valign="top">4/8</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">36 (14-64)</td><td align="char" char="." valign="top">5/14</td><td align="left" valign="top">.15</td><td align="left" valign="top">78 (52-93)</td><td align="char" char="." valign="top">14/18</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">33 (11-65)</td><td align="char" char="." valign="top">4/12</td><td align="left" valign="top">.02<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior 3</td><td align="left" valign="top">92 (60-99)</td><td align="char" char="." valign="top">11/12</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">100 (68-100)</td><td align="char" char="." valign="top">8/8</td><td align="left" valign="top">.15</td><td align="left" valign="top">100 (73-100)</td><td align="char" char="." valign="top">14/14</td><td align="left" valign="top">.47</td><td align="left" valign="top">94 (71-100)</td><td align="char" char="." valign="top">17/18</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">83 (51-97)</td><td align="char" char="." valign="top">10/12</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior 4</td><td align="left" valign="top">67 (35-89)</td><td align="char" char="." 
valign="top">8/12</td><td align="left" valign="top">.20</td><td align="left" valign="top">88 (47-99)</td><td align="char" char="." valign="top">7/8</td><td align="left" valign="top">.74</td><td align="left" valign="top">86 (56-97)</td><td align="char" char="." valign="top">12/14</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">94 (71-100)</td><td align="char" char="." valign="top">17/18</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">83 (51-97)</td><td align="char" char="." valign="top">10/12</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending 3</td><td align="left" valign="top">67 (35-89)</td><td align="char" char="." valign="top">8/12</td><td align="left" valign="top">.20</td><td align="left" valign="top">50 (17-83)</td><td align="char" char="." valign="top">4/8</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">57 (30-81)</td><td align="char" char="." valign="top">8/14</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">100 (78-100)</td><td align="char" char="." valign="top">18/18</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">58 (29-84)</td><td align="char" char="." valign="top">7/12</td><td align="left" valign="top">.42</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending 4</td><td align="left" valign="top">58 (29-84)</td><td align="char" char="." valign="top">7/12</td><td align="left" valign="top">.08</td><td align="left" valign="top">75 (36-96)</td><td align="char" char="." valign="top">6/8</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">71 (42-90)</td><td align="char" char="." 
valign="top">10/14</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">61 (36-82)</td><td align="char" char="." valign="top">11/18</td><td align="left" valign="top">.38</td><td align="left" valign="top">83 (51-97)</td><td align="char" char="." valign="top">10/12</td><td align="left" valign="top">&#x2265;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Resident 3</td><td align="left" valign="top">50 (22-77)</td><td align="char" char="." valign="top">6/12</td><td align="left" valign="top">.03<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">25 (45-64)</td><td align="char" char="." valign="top">2/8</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">50 (24-76)</td><td align="char" char="." valign="top">7/14</td><td align="left" valign="top">.80</td><td align="left" valign="top">83 (58-96)</td><td align="char" char="." valign="top">15/18</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">33 (11-65)</td><td align="char" char="." valign="top">4/12</td><td align="left" valign="top">.02<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Resident 4</td><td align="left" valign="top">42 (19-68)</td><td align="char" char="." valign="top">5/12</td><td align="left" valign="top">.01<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td><td align="left" valign="top">38 (10-74)</td><td align="char" char="." valign="top">3/8</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">50 (24-76)</td><td align="char" char="." valign="top">7/14</td><td align="left" valign="top">.80</td><td align="left" valign="top">78 (52-93)</td><td align="char" char="." 
valign="top">14/18</td><td align="left" valign="top">&#x2265;.99</td><td align="left" valign="top">33 (11-65)</td><td align="char" char="." valign="top">4/12</td><td align="left" valign="top">.02<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>The number of errors correctly detected by DeepSeek-R1 was compared with that of Grok3 and radiologists by using Wald chi-square tests.</p></fn><fn id="table5fn2"><p><sup>b</sup>Ref.: Reference.</p></fn><fn id="table5fn3"><p><sup>c</sup><italic>P</italic>&#x003C;.05 was considered statistically significant.</p></fn></table-wrap-foot></table-wrap><p>In the few-shot setting, DeepSeek-R1 outperformed resident 3 in detecting both omission errors (detection rate, 100%, 95% CI 70&#x2010;100 vs 50%, 95% CI 22-77 and 92%, 95% CI 60&#x2010;100 vs 33%, 95% CI 11&#x2010;65; <italic>P</italic>=.03 and <italic>P</italic>=.02, respectively) and resident 4 (100%, 95% CI 70&#x2010;100 vs 25%, 95% CI 7&#x2010;57 and 92%, 95% CI 60&#x2010;100 vs 33%, 95% CI 11&#x2010;65; <italic>P</italic>=.01 and <italic>P</italic>=.02, respectively). However, given the limited sample size, the subgroup analyses were likely underpowered, and the resulting comparisons should be viewed as exploratory and interpreted with caution. 
More results are displayed in <xref ref-type="fig" rid="figure2">Figure 2B</xref> and <xref ref-type="table" rid="table5">Table 5</xref>.</p></sec><sec id="s3-4"><title>Performance in Detecting Errors by Imaging Modality on Subdataset 1 (100 Reports With 64 Errors)</title><p>In the 0-shot learning scenario, no significant difference was observed in error detection between DeepSeek-R1 and the radiologists for either radiography reports (detection rate: 66.7% vs 41.9%&#x2010;87.1%; <italic>P</italic> value range: .29 to &#x003E;.99) or CT/MRI reports (45.5% vs 27.3%&#x2010;69.7%; <italic>P</italic> value range: .17 to &#x003E;.99).</p><p>In the few-shot learning scenario, DeepSeek-R1 detected significantly more errors than the lowest-performing resident radiologist across both radiography and CT/MRI reports. For radiography reports, the detection rate was 90.3% (95% CI 73.1&#x2010;97.4) compared to 54.8% (95% CI 36.3&#x2010;72.2; <italic>P</italic>=.01). For CT/MRI reports, DeepSeek-R1 achieved a detection rate of 78.8% (95% CI 60.6&#x2010;90.4) versus 45.4% (95% CI 28.5&#x2010;63.4; <italic>P</italic>=.04). However, no significant differences were observed between DeepSeek-R1 and the other radiologists in error detection rates per report (<italic>P</italic> values ranging from .053 to &#x003E;.99 for radiography and .07 to &#x003E;.99 for CT/MRI). 
Additional results are provided in <xref ref-type="table" rid="table6">Table 6</xref>.</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Subgroup analyses of imaging modalities.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="2">Total</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom" colspan="2">Radiography</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom" colspan="2">CT<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup> and MRI<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup></td><td align="left" valign="bottom"><italic>P</italic> value</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Detection rate (%), 95% CI</td><td align="left" valign="top">Values, n/N</td><td align="left" valign="top"/><td align="left" valign="top">Detection rate (%), 95% CI</td><td align="left" valign="top">Values, n/N</td><td align="left" valign="top"/><td align="left" valign="top">Detection rate (%), 95% CI</td><td align="left" valign="top">Values, n/N</td><td align="left" valign="top"/></tr></thead><tbody><tr><td align="left" valign="top" colspan="10">0-shot</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>DeepSeek-R1</td><td align="left" valign="top">60.9 (47.9-72.6)</td><td align="left" valign="top">39/64</td><td align="left" valign="top">Ref.<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup></td><td align="left" valign="top">66.7 (45.4-80.2)</td><td align="left" valign="top">21/31</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">54.5 (38.0-70.2)</td><td align="left" valign="top">18/33</td><td align="left" valign="top">Ref.</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Grok3</td><td align="left" valign="top">53.1 (40.3-65.6)</td><td align="left" valign="top">34/64</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">64.5 (46.9-78.9)</td><td align="left" valign="top">20/31</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">42.4 (27.2-59.2)</td><td align="left" valign="top">14/33</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior 1</td><td align="left" valign="top">73.4 (60.7-83.4)</td><td align="left" valign="top">47/64</td><td align="left" valign="top">.92</td><td align="left" valign="top">87.1 (69.2-95.8)</td><td align="left" valign="top">27/31</td><td align="left" valign="top">.48</td><td align="left" valign="top">60.6 (36.6-71.5)</td><td align="left" valign="top">20/33</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior 2</td><td align="left" valign="top">71.9 (59.1-82.1)</td><td align="left" valign="top">46/64</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">74.2 (55.1-87.5)</td><td align="left" valign="top">23/31</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">69.7 (51.1-83.8)</td><td align="left" valign="top">23/33</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending 1</td><td align="left" valign="top">68.8 (55.8-79.2)</td><td align="left" valign="top">44/64</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">77.4 (58.5-89.7)</td><td align="left" valign="top">24/31</td><td align="left" valign="top">&#x003E;.99</td><td 
align="left" valign="top">60.6 (42.2-76.6)</td><td align="left" valign="top">20/33</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending 2</td><td align="left" valign="top">65.6 (52.6-76.8)</td><td align="left" valign="top">42/64</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">74.2 (55.4-87.5)</td><td align="left" valign="top">23/31</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">57.6 (39.4-74.1)</td><td align="left" valign="top">19/33</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Resident 1</td><td align="left" valign="top">40.6 (28.8-53.6)</td><td align="left" valign="top">26/64</td><td align="left" valign="top">.15</td><td align="left" valign="top">41.9 (52.1-60.7)</td><td align="left" valign="top">13/31</td><td align="left" valign="top">.29</td><td align="left" valign="top">33.3 (18.6-51.9)</td><td align="left" valign="top">11/33</td><td align="left" valign="top">.58</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Resident 2</td><td align="left" valign="top">35.9 (24.6-49.0)</td><td align="left" valign="top">23/64</td><td align="left" valign="top">.03<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">45.2 (27.8-63.7)</td><td align="left" valign="top">14/31</td><td align="left" valign="top">.51</td><td align="left" valign="top">27.3 (13.9-45.8)</td><td align="left" valign="top">9/33</td><td align="left" valign="top">.17</td></tr><tr><td align="left" valign="top" colspan="10">Few-shot</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>DeepSeek-R1</td><td align="left" valign="top">84.4 (72.7-91.9)</td><td align="left" valign="top">54/64</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">90.3 (73.1-97.4)</td><td align="left" valign="top">28/31</td><td align="left" valign="top">Ref.</td><td align="left" valign="top">78.8 (60.6-90.4)</td><td align="left" valign="top">26/33</td><td align="left" valign="top">Ref.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Grok3</td><td align="left" valign="top">56.3 (43.3-68.4)</td><td align="left" valign="top">36/64</td><td align="left" valign="top">.003<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">66.7 (45.8-82.7)</td><td align="left" valign="top">21/31</td><td align="left" valign="top">.20</td><td align="left" valign="top">45.4 (28.5-63.4)</td><td align="left" valign="top">15/33</td><td align="left" valign="top">.04<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior 3</td><td align="left" valign="top">93.8 (84.0-98.0)</td><td align="left" valign="top">60/64</td><td align="left" valign="top">.63</td><td align="left" valign="top">93.5 (77.2-98.9)</td><td align="left" valign="top">29/31</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">93.9 (78.4-98.9)</td><td align="left" valign="top">31/33</td><td align="left" valign="top">.51</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Senior 4</td><td align="left" valign="top">84.4 (72.7-91.9)</td><td align="left" valign="top">54/64</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">93.5 (77.2-99.0)</td><td align="left" 
valign="top">29/31</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">75.8 (57.4-88.3)</td><td align="left" valign="top">25/33</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending 3</td><td align="left" valign="top">70.3 (57.4-80.8)</td><td align="left" valign="top">45/64</td><td align="left" valign="top">.40</td><td align="left" valign="top">77.4 (58.5-89.7)</td><td align="left" valign="top">24/31</td><td align="left" valign="top">&#x003E;.99</td><td align="left" valign="top">63.6 (45.2-79.0)</td><td align="left" valign="top">21/33</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending 4</td><td align="left" valign="top">68.8 (55.8-79.4)</td><td align="left" valign="top">44/64</td><td align="left" valign="top">.26</td><td align="left" valign="top">67.7 (48.5-82.7)</td><td align="left" valign="top">21/31</td><td align="left" valign="top">.20</td><td align="left" valign="top">69.7 (51.1-83.8)</td><td align="left" valign="top">23/33</td><td align="left" valign="top">&#x003E;.99</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Resident 3</td><td align="left" valign="top">53.1 (40.3-65.6)</td><td align="left" valign="top">34/64</td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">61.3 (42.3-77.6)</td><td align="left" valign="top">19/31</td><td align="left" valign="top">.053</td><td align="left" valign="top">45.4 (28.5-63.4)</td><td align="left" valign="top">15/33</td><td align="left" valign="top">.04<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Resident 4</td><td align="left" valign="top">51.6 (38.8-64.1)</td><td align="left" valign="top">33/64</td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">54.8 (36.3-72.2)</td><td align="left" valign="top">17/31</td><td align="left" valign="top">.01<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup></td><td align="left" valign="top">48.5 (31.2-66.1)</td><td align="left" valign="top">16/33</td><td align="left" valign="top">.07</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>CT: computed tomography.</p></fn><fn id="table6fn2"><p><sup>b</sup>MRI: magnetic resonance imaging. </p></fn><fn id="table6fn3"><p><sup>c</sup>Ref.: Reference.</p></fn><fn id="table6fn4"><p><sup>d</sup><italic>P</italic>&#x003C;.05 was considered statistically significant.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-5"><title>Performance in Detecting Errors on Dataset 2 (100 Reports With 60 Errors)</title><p>In the dataset 2 under the few-shot setting, DeepSeek-R1 achieved a detection rate of 95%, which was significantly higher than that of resident 5 (61.7%, <italic>P</italic>&#x003C;.001) and attending radiologist 5 (71.7%, <italic>P</italic>=.002). These results suggest that DeepSeek-R1&#x2019;s performance is consistent across different institutions and report structures.</p></sec><sec id="s3-6"><title>Stage 4: Validation on a Real-World Dataset</title><p>Among the 800 reports analyzed, Deepseek-R1 classified 207 reports as having errors, among which 117 were true errors, yielding a PPV of 0.565. The distribution of error types in the real-world dataset showed that omission was the most common error (n=56), whereas incorrect laterality was the least common (n=6). 
Furthermore, DeepSeek-R1 successfully detected at least one instance of every error category.</p><p>The clinical impact of false-positive results incorrectly classified by DeepSeek-R1 was further evaluated. There were no reports in which the model introduced errors based on content not present in the original report. The most serious false-positive results involved the addition of clinically insignificant observations to the impression section.
The improvements were as follows: correcting typographical and grammatical errors (n=1); enhancing oversimplified impressions to improve clarity (n=6); elaborating on observations that were mentioned in the impressions but insufficiently detailed in the findings section, leading to more thorough documentation (n=3); and adding clinically relevant details, such as nerve root compressions, air-fluid levels, spinal stenosis, or new observations, that were documented in the findings but omitted from the impressions, thereby improving the completeness of the report (n=7).</p></sec><sec id="s3-7"><title>Interrater Agreement</title><p><xref ref-type="fig" rid="figure3">Figure 3A,B</xref>, Tables S4 and S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> display the heatmap of Cohen &#x03BA; coefficients illustrating pairwise agreement among readers in detecting errors in radiology reports under the 0-shot and few-shot settings. The analysis involved 12 radiologists (4 seniors, 4 attendings, and 4 residents) and 2 AI models (DeepSeek-R1 and Grok3). The interrater agreement analysis revealed several key patterns. Between the 2 AI systems in the 0-shot and few-shot settings, DeepSeek-R1 and Grok3 exhibited moderate and fair agreement (&#x03BA;=0.45 and 0.29, respectively). Among human raters, agreement ranged from slight to substantial, with the lowest consistency observed between the 2 attendings (&#x03BA;=0.02) under the 0-shot setting and the highest between attending 2 and resident 1 (&#x03BA;=0.66) under the few-shot setting. In terms of human-AI agreement, DeepSeek-R1 showed fair to moderate consistency with radiologists (&#x03BA;=0.42&#x2010;0.57) under the 0-shot setting, while under the few-shot setting, agreement ranged from slight to moderate (&#x03BA;=0.09&#x2010;0.45). 
This pattern of variability in human interpretations, contrasted with the more consistent AI-human agreement, highlights both the subjective nature of error detection in radiology reports and the potential of AI systems to provide more standardized assessments. The generally low to moderate agreement among all raters (&#x03BA;=0.02&#x2010;0.66) underscores the complexity and inherent subjectivity in radiology report error detection, suggesting that even experienced clinicians may apply different criteria when evaluating report accuracy.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Interrater agreement analysis. Cohen &#x03BA; coefficients for pairwise agreement among readers (large language models and radiologists). (A) Consistency for error types in the 0-shot setting. (B) Consistency for error types in the few-shot setting. Agreement scale: &#x2013;1: complete inconsistency; 0: occasional agreement; 0.01-0.20: slight agreement; 0.21-0.40: fair agreement; 0.41-0.60: moderate agreement; 0.61-0.80: substantial agreement; and 0.81-1.00: almost perfect agreement.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86841_fig03.png"/></fig></sec><sec id="s3-8"><title>Reading Time</title><p>In the task of error detection for 100 reports, LLMs demonstrated significant time efficiency advantages. Grok3 processed 100 reports in 0.34 and 0.26 hours under 0-shot and few-shot settings, respectively. For the same task, DeepSeek-R1 required 2.56 hours in the 0-shot setting and 2.26 hours in the few-shot setting.</p><p>In contrast, radiologists&#x2019; reading times ranged from 3.04 to 5.36 hours. For individual report reading, LLMs exhibited even more pronounced speed: Grok3 required only 12.24 seconds per report in the 0-shot setting, while DeepSeek-R1 needed 92.16 seconds. The fastest and slowest radiologists required 109.44 seconds and 192.96 seconds per report, respectively. 
In the few-shot setting, LLMs showed a slight decrease in processing time and remained substantially faster than human experts (Figure S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study evaluated the performance of DeepSeek-R1 in detecting errors in Chinese emergency radiology reports. We compared its accuracy and processing efficiency with Grok3 and with radiologists of different experience levels under both 0-shot and few-shot settings. By introducing a real-world, emergency-focused evaluation framework, this work provides methodological insights into AI-assisted quality control in digital health care.</p><p>DeepSeek-R1 demonstrated strong error detection performance in emergency radiology. Under stress-testing conditions, it achieved a detection rate of 84.4% in the few-shot setting, indicating high sensitivity to predefined error types rather than direct real-world detection rates. Notably, its performance exceeded that of resident radiologists in identifying critical errors, supporting its potential clinical value for improving report accuracy and facilitating timely patient management.</p></sec><sec id="s4-2"><title>Comparison With Previous Studies</title><p>Previous studies have shown that LLMs, such as GPT-4, can detect errors in radiology reports. Most of these studies relied on synthetically generated error datasets across imaging modalities, including radiography, CT, and MRI. Our study extends this literature in several important ways.</p><p>First, to our knowledge, this is the first validation of an LLM specifically in the emergency radiology setting. 
Emergency radiology is a high-risk environment characterized by time pressure, incomplete information, and urgent clinical decision-making, which differs substantially from routine radiologic workflows.</p><p>Second, we tailored the model to Chinese clinical language, particularly to address spelling and semantic ambiguities introduced by Pinyin-based input. This addresses a key limitation in prior cross-lingual applications and is especially relevant given reports of the limited accuracy of models such as Claude 3.5 in Chinese ultrasound reports. Third, by analyzing 7435 real-world clinical reports across CT, MRI, and radiography, our study avoided exclusive reliance on synthetic error injection [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. This approach better captures the complexity and variability of real emergency reporting workflows.</p><p>Together, these findings suggest that DeepSeek-R1 may serve as an efficient and scalable assistant for radiologists, particularly for identifying critical omissions and communication-related errors. Importantly, validation using independent datasets from 2 additional tertiary hospitals with different reporting systems supports the robustness of the proposed approach across institutions and report templates.</p><p>Consistent with prior studies, our results indicate that LLMs can proofread radiology reports at a level comparable to most human readers. Reporting errors occur across all levels of radiologist experience [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref23">23</xref>], and the observed performance likely reflects routine clinical practice. 
These findings highlight the potential role of AI in supporting radiology workflows beyond image interpretation alone [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>].</p><p>In terms of efficiency, our results align with previous studies examining the integration of LLMs into radiologic workflows [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. When used as a proofreading tool, DeepSeek-R1 achieved performance comparable to human readers while requiring less time. Unlike human readers, whose performance may be affected by multitasking or fatigue during off-hours [<xref ref-type="bibr" rid="ref32">32</xref>-<xref ref-type="bibr" rid="ref34">34</xref>], DeepSeek-R1 provides stable and consistent output independent of such factors.</p></sec><sec id="s4-3"><title>Limitations</title><p>This study has several limitations. First, part of the evaluation used predefined correct and incorrect report sets in which 127 errors were deliberately introduced. Although the error categories were derived from commonly reported patterns, this synthetic design cannot fully capture the diversity and contextual complexity of errors in routine clinical practice. Our real-world validation in stage 4 provides a reference for naturalistic error frequency. Among 800 consecutive emergency radiology reports, 117 true reporting errors were identified, corresponding to an overall error rate of 14.6%, with omission being the most common error type. This prevalence is lower than the error density used in the synthetic datasets in stages 2&#x2010;3. The higher synthetic error density was intentionally adopted to create a controlled and sufficiently challenging evaluation environment. This design allowed statistically efficient comparison of error detection performance across AI models and human readers within a limited sample size. 
Accordingly, performance metrics from stages 2&#x2010;3 should be interpreted as reflecting stress-tested detection capability rather than real-world error prevalence. Although real-world validation yielded a moderate PPV of 56.5%, this result should be interpreted in the context of workflow efficiency. In our study, AI-assisted report review was faster than human-only proofreading. When false-positive alerts can be dismissed with brief verification, the additional workload introduced by false positives remains limited, and overall time efficiency may be preserved. Moreover, PPV is inherently dependent on error prevalence and should not be extrapolated across settings with different baseline error rates. From a workflow perspective, AI assistance shifts the task from exhaustive manual proofreading to alert-based verification. Despite the presence of false positives, this approach reallocates radiologists&#x2019; cognitive effort from error searching to decision confirmation. This shift may be particularly valuable in high-fatigue emergency settings.</p><p>Second, the exploratory analyses in stages 2&#x2010;3 were conducted in a simulated environment that differed from routine clinical practice. Factors such as artificially high error prevalence, lack of imaging data, long report length, and task instructions optimized for AI may have influenced reader performance. In addition, learning effects from repeated exposure to similar error patterns may have biased results. These factors should be considered when interpreting human-AI comparisons.</p><p>Third, the error taxonomy and definitions of false-positive subtypes involve some subjectivity. Although granular categorizations were applied to reduce bias, certain subtypes may not fully reflect errors encountered in routine practice. 
Subgroup analyses of rare error categories were also limited by small sample sizes and should be considered exploratory.</p><p>Fourth, although this study assessed time efficiency, it did not include a formal cost-effectiveness analysis. Reduced processing time suggests potential resource savings, but costs related to infrastructure, system integration, and maintenance were not evaluated.</p><p>Fifth, the experimental setting may have introduced a Hawthorne effect [<xref ref-type="bibr" rid="ref35">35</xref>], whereby radiologists&#x2019; awareness of observation temporarily enhanced performance. This may have led to overestimation of human error detection rates. Importantly, this does not undermine the clinical relevance of our findings. DeepSeek-R1 consistently outperformed resident radiologists, even under conditions likely to favor human readers, suggesting that its error detection capability is robust and may be more pronounced in routine, nonobserved clinical settings.</p><p>Finally, generalizability warrants consideration. Although this study included data from 3 tertiary hospitals with different reporting systems and template styles, all institutions were within the same health care system. Validation using dataset 2 supports robustness across institutions within Chinese emergency radiology; however, further validation in other hospitals and nonemergency settings is needed, particularly in environments with lower baseline error prevalence. In addition, the strong performance of DeepSeek-R1 is closely related to its optimization for Chinese clinical language. 
Direct 0-shot transfer to other languages may therefore be inappropriate, and language-specific optimization or models with stronger multilingual medical pretraining will likely be required.</p><p>Overall, while our findings support the robustness of DeepSeek-R1 across institutions and report templates within Chinese emergency radiology, generalization to other clinical settings or languages should be approached cautiously. Further prospective validation and language-specific optimization are required. Additional domain-specific fine-tuning using emergency medicine&#x2013;focused corpora may further improve performance in specialized settings such as trauma or neurologic imaging.</p></sec><sec id="s4-4"><title>Conclusion</title><p>DeepSeek-R1 represents a meaningful advance in automated quality control for radiology reports, particularly in Chinese emergency settings. Its ability to identify clinically significant errors with high efficiency supports its role as an assistive proofreading tool in modern radiology workflows.</p></sec></sec></body><back><ack><p>HS and TW contributed equally as co-first authors. SZ, DZ, and BZ served as corresponding authors.</p><p>No generative AI tools were used at any stage in the preparation of this manuscript.</p></ack><notes><sec><title>Funding</title><p>We acknowledge financial support from the National Key Research and Development Program of China (2023YFF1204600), the National Natural Science Foundation of China (82227802, 82302190, U21A6005), the Clinical Frontier Technology Program of the First Affiliated Hospital of Jinan University (No. 
JNU1AF-CFTP-2022-a01201), the Science and Technology Projects in Guangzhou (202201020022, 2023A03J1036, 2023A03J1038, 2025A04J7006), the National Key R&#x0026;D Program of China (2024YFA1012000, 2024YFC2417800), the Guangdong Basic and Applied Basic Research Foundation (2024A1515140146), the Outstanding Young Talents of Guangdong Special Support Program (Health Commission of Guangdong Province; 0720240213), the National Natural Science Foundation of China (62571228), the Beijing Natural Science Foundation (Z250002), and the Science and Technology Youth Talent Nurturing Program of Jinan University (21623209).</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: HS, TW, DZ, BZ</p><p>Data curation: FW, JF, YL, XW, SL, LC, QR</p><p>Formal analysis: JS, YZ, XL, LW, GM</p><p>Funding acquisition: SZ, DZ, BZ</p><p>Investigation: XM, JX, JY</p><p>Methodology: ZJ, XW, WH, XH</p><p>Project administration: HS, TW</p><p>Visualization: HS, TW</p><p>Writing&#x2014;original draft: HS, TW</p><p>Writing&#x2014;review and editing: all authors</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CT</term><def><p>computed tomography</p></def></def-item><def-item><term id="abb3">FPRR</term><def><p>false positive report rate</p></def></def-item><def-item><term id="abb4">LLM</term><def><p> large language model</p></def></def-item><def-item><term id="abb5">MRI</term><def><p>magnetic resonance imaging</p></def></def-item><def-item><term id="abb6">PPV</term><def><p>positive predictive value</p></def></def-item><def-item><term id="abb7">TPR</term><def><p>true positive 
rate</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Selvarajan</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Levin</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Parker</surname><given-names>L</given-names> </name></person-group><article-title>The increasing use of emergency department imaging in the United States: is it appropriate?</article-title><source>AJR Am J Roentgenol</source><year>2019</year><month>10</month><volume>213</volume><issue>4</issue><fpage>W180</fpage><lpage>W184</lpage><pub-id pub-id-type="doi">10.2214/AJR.19.21386</pub-id><pub-id pub-id-type="medline">31237433</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hanna</surname><given-names>TN</given-names> </name><name name-style="western"><surname>Lamoureux</surname><given-names>C</given-names> </name><name name-style="western"><surname>Krupinski</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Weber</surname><given-names>S</given-names> </name><name name-style="western"><surname>Johnson</surname><given-names>JO</given-names> </name></person-group><article-title>Effect of shift, schedule, and volume on interpretive accuracy: a retrospective analysis of 2.9 million radiologic examinations</article-title><source>Radiology</source><year>2018</year><month>04</month><volume>287</volume><issue>1</issue><fpage>205</fpage><lpage>212</lpage><pub-id pub-id-type="doi">10.1148/radiol.2017170555</pub-id><pub-id pub-id-type="medline">29156150</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Alexander</surname><given-names>R</given-names> </name><name name-style="western"><surname>Waite</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bruno</surname><given-names>MA</given-names> </name><etal/></person-group><article-title>Mandating limits on workload, duty, and speed in radiology</article-title><source>Radiology</source><year>2022</year><month>08</month><volume>304</volume><issue>2</issue><fpage>274</fpage><lpage>282</lpage><pub-id pub-id-type="doi">10.1148/radiol.212631</pub-id><pub-id pub-id-type="medline">35699581</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bruls</surname><given-names>RJM</given-names> </name><name name-style="western"><surname>Kwee</surname><given-names>RM</given-names> </name></person-group><article-title>Workload for radiologists during on-call hours: dramatic increase in the past 15 years</article-title><source>Insights Imaging</source><year>2020</year><month>11</month><day>23</day><volume>11</volume><issue>1</issue><fpage>121</fpage><pub-id pub-id-type="doi">10.1186/s13244-020-00925-z</pub-id><pub-id pub-id-type="medline">33226490</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kwee</surname><given-names>RM</given-names> </name><name name-style="western"><surname>Toxopeus</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kwee</surname><given-names>TC</given-names> </name></person-group><article-title>Imaging overuse in the emergency department: the view of radiologists and emergency physicians</article-title><source>Eur J Radiol</source><year>2024</year><month>07</month><volume>176</volume><fpage>111536</fpage><pub-id pub-id-type="doi">10.1016/j.ejrad.2024.111536</pub-id><pub-id 
pub-id-type="medline">38820950</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McDonald</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Schwartz</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Eckel</surname><given-names>LJ</given-names> </name><etal/></person-group><article-title>The effects of changes in utilization and technological advancements of cross-sectional imaging on radiologist workload</article-title><source>Acad Radiol</source><year>2015</year><month>09</month><volume>22</volume><issue>9</issue><fpage>1191</fpage><lpage>1198</lpage><pub-id pub-id-type="doi">10.1016/j.acra.2015.05.007</pub-id><pub-id pub-id-type="medline">26210525</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="web"><article-title>Radiology facing a global shortage</article-title><source>RSNA News</source><year>2022</year><month>05</month><day>10</day><access-date>2026-03-17</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.rsna.org/news/2022/may/global-radiologist-shortage">https://www.rsna.org/news/2022/may/global-radiologist-shortage</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Geijer</surname><given-names>H</given-names> </name><name name-style="western"><surname>Geijer</surname><given-names>M</given-names> </name></person-group><article-title>Added value of double reading in diagnostic radiology, a systematic review</article-title><source>Insights Imaging</source><year>2018</year><month>06</month><volume>9</volume><issue>3</issue><fpage>287</fpage><lpage>301</lpage><pub-id pub-id-type="doi">10.1007/s13244-018-0599-0</pub-id><pub-id
pub-id-type="medline">29594850</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="web"><article-title>Claude 3.5 Sonnet</article-title><source>Anthropic</source><year>2024</year><month>06</month><day>21</day><access-date>2026-03-17</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.anthropic.com/news/claude-3-5-sonnet">https://www.anthropic.com/news/claude-3-5-sonnet</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="web"><article-title>Hello GPT-4o</article-title><source>OpenAI</source><year>2024</year><month>05</month><day>13</day><access-date>2026-03-17</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/index/hello-gpt-4o">https://openai.com/index/hello-gpt-4o</ext-link></comment></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schmidt</surname><given-names>RA</given-names> </name><name name-style="western"><surname>Seah</surname><given-names>JCY</given-names> </name><name name-style="western"><surname>Cao</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>W</given-names> </name><name name-style="western"><surname>Yeung</surname><given-names>J</given-names> </name></person-group><article-title>Generative large language models for detection of speech recognition errors in radiology reports</article-title><source>Radiol Artif Intell</source><year>2024</year><month>03</month><volume>6</volume><issue>2</issue><fpage>e230205</fpage><pub-id pub-id-type="doi">10.1148/ryai.230205</pub-id><pub-id pub-id-type="medline">38265301</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name
name-style="western"><surname>Gertz</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Dratsch</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bunck</surname><given-names>AC</given-names> </name><etal/></person-group><article-title>Potential of GPT-4 for detecting errors in radiology reports: implications for reporting accuracy</article-title><source>Radiology</source><year>2024</year><month>04</month><volume>311</volume><issue>1</issue><fpage>e232714</fpage><pub-id pub-id-type="doi">10.1148/radiol.232714</pub-id><pub-id pub-id-type="medline">38625012</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shin</surname><given-names>HJ</given-names> </name><etal/></person-group><article-title>Large-scale validation of the feasibility of GPT-4 as a proofreading tool for head CT reports</article-title><source>Radiology</source><year>2025</year><month>01</month><volume>314</volume><issue>1</issue><fpage>e240701</fpage><pub-id pub-id-type="doi">10.1148/radiol.240701</pub-id><pub-id pub-id-type="medline">39873601</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yan</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>B</given-names> </name><etal/></person-group><article-title>The use of large language models in detecting Chinese ultrasound report errors</article-title><source>NPJ Digit 
Med</source><year>2025</year><month>01</month><day>28</day><volume>8</volume><issue>1</issue><fpage>66</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01468-7</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cozzi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pinker</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hidber</surname><given-names>A</given-names> </name><etal/></person-group><article-title>BI-RADS category assignments by GPT-3.5, GPT-4, and Google Bard: a multilanguage study</article-title><source>Radiology</source><year>2024</year><month>04</month><volume>311</volume><issue>1</issue><fpage>e232133</fpage><pub-id pub-id-type="doi">10.1148/radiol.232133</pub-id><pub-id pub-id-type="medline">38687216</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lai</surname><given-names>V</given-names> </name><name name-style="western"><surname>Ngo</surname><given-names>N</given-names> </name><name name-style="western"><surname>Veyseh</surname><given-names>APB</given-names> </name><etal/></person-group><article-title>ChatGPT beyond English: towards a comprehensive evaluation of large language models in multilingual learning</article-title><source>arXiv</source><comment>Preprint posted online on Apr 12, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2304.05613</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gibney</surname><given-names>E</given-names> </name></person-group><article-title>Scientists flock to DeepSeek: how they&#x2019;re using the blockbuster AI
model</article-title><source>Nature</source><year>2025</year><month>01</month><day>29</day><pub-id pub-id-type="doi">10.1038/d41586-025-00275-0</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Conroy</surname><given-names>G</given-names> </name><name name-style="western"><surname>Mallapaty</surname><given-names>S</given-names> </name></person-group><article-title>How China created AI model DeepSeek and shocked the world</article-title><source>Nature</source><year>2025</year><month>02</month><day>13</day><volume>638</volume><issue>8050</issue><fpage>300</fpage><lpage>301</lpage><pub-id pub-id-type="doi">10.1038/d41586-025-00259-0</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><etal/></person-group><article-title>DeepSeek-R1 incentivizes reasoning in LLMs through reinforcement learning</article-title><source>Nature</source><year>2025</year><month>09</month><volume>645</volume><issue>8081</issue><fpage>633</fpage><lpage>638</lpage><pub-id pub-id-type="doi">10.1038/s41586-025-09422-z</pub-id><pub-id pub-id-type="medline">40962978</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="web"><source>Research randomizer</source><access-date>2026-03-17</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.randomizer.org/">https://www.randomizer.org/</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Quint</surname><given-names>LE</given-names> </name><name name-style="western"><surname>Quint</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Myles</surname><given-names>JD</given-names> </name></person-group><article-title>Frequency and spectrum of errors in final radiology reports generated with automatic speech recognition technology</article-title><source>J Am Coll Radiol</source><year>2008</year><month>12</month><volume>5</volume><issue>12</issue><fpage>1196</fpage><lpage>1199</lpage><pub-id pub-id-type="doi">10.1016/j.jacr.2008.07.005</pub-id><pub-id pub-id-type="medline">19027683</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ringler</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Goss</surname><given-names>BC</given-names> </name><name name-style="western"><surname>Bartholmai</surname><given-names>BJ</given-names> </name></person-group><article-title>Syntactic and semantic errors in radiology reports associated with speech recognition software</article-title><source>Health Informatics J</source><year>2017</year><month>03</month><volume>23</volume><issue>1</issue><fpage>3</fpage><lpage>13</lpage><pub-id pub-id-type="doi">10.1177/1460458215613614</pub-id><pub-id pub-id-type="medline">26635322</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vosshenrich</surname><given-names>J</given-names> </name><name name-style="western"><surname>Nesic</surname><given-names>I</given-names> </name><name name-style="western"><surname>Cyriac</surname><given-names>J</given-names> </name><name name-style="western"><surname>Boll</surname><given-names>DT</given-names> </name><name 
name-style="western"><surname>Merkle</surname><given-names>EM</given-names> </name><name name-style="western"><surname>Heye</surname><given-names>T</given-names> </name></person-group><article-title>Revealing the most common reporting errors through data mining of the report proofreading process</article-title><source>Eur Radiol</source><year>2021</year><month>04</month><volume>31</volume><issue>4</issue><fpage>2115</fpage><lpage>2125</lpage><pub-id pub-id-type="doi">10.1007/s00330-020-07306-6</pub-id><pub-id pub-id-type="medline">32997178</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><source>Wenjuanxing - Online survey platform</source><access-date>2026-03-17</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.wjx.cn/">https://www.wjx.cn/</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wallis</surname><given-names>S</given-names> </name></person-group><article-title>Binomial confidence intervals and contingency tests: mathematical fundamentals and the evaluation of alternative methods</article-title><source>J Quant Linguist</source><year>2013</year><month>08</month><volume>20</volume><issue>3</issue><fpage>178</fpage><lpage>208</lpage><pub-id pub-id-type="doi">10.1080/09296174.2013.799918</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>J</given-names> </name></person-group><article-title>A coefficient of agreement for nominal scales</article-title><source>Educ Psychol Meas</source><year>1960</year><month>04</month><volume>20</volume><issue>1</issue><fpage>37</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.1177/001316446002000104</pub-id></nlm-citation></ref><ref 
id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Salam</surname><given-names>B</given-names> </name><name name-style="western"><surname>St&#x00FC;we</surname><given-names>C</given-names> </name><name name-style="western"><surname>Nowak</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Large language models for error detection in radiology reports: a comparative analysis between closed-source and privacy-compliant open-source models</article-title><source>Eur Radiol</source><year>2025</year><month>08</month><volume>35</volume><issue>8</issue><fpage>4549</fpage><lpage>4557</lpage><pub-id pub-id-type="doi">10.1007/s00330-025-11438-y</pub-id><pub-id pub-id-type="medline">39979623</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lexa</surname><given-names>FJ</given-names> </name><name name-style="western"><surname>Jha</surname><given-names>S</given-names> </name></person-group><article-title>Artificial intelligence for image interpretation: counterpoint-the radiologist&#x2019;s incremental foe</article-title><source>AJR Am J Roentgenol</source><year>2021</year><month>09</month><volume>217</volume><issue>3</issue><fpage>558</fpage><lpage>559</lpage><pub-id pub-id-type="doi">10.2214/AJR.21.25484</pub-id><pub-id pub-id-type="medline">33533639</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Coppola</surname><given-names>F</given-names> </name><name name-style="western"><surname>Faggioni</surname><given-names>L</given-names> </name><name name-style="western"><surname>Gabelloni</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Human, all too human? 
An all-around appraisal of the &#x201C;artificial intelligence revolution&#x201D; in medical imaging</article-title><source>Front Psychol</source><year>2021</year><volume>12</volume><fpage>710982</fpage><pub-id pub-id-type="doi">10.3389/fpsyg.2021.710982</pub-id><pub-id pub-id-type="medline">34650476</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rau</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rau</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zoeller</surname><given-names>D</given-names> </name><etal/></person-group><article-title>A context-based chatbot surpasses trained radiologists and generic ChatGPT in following the ACR appropriateness guidelines</article-title><source>Radiology</source><year>2023</year><month>07</month><volume>308</volume><issue>1</issue><fpage>e230970</fpage><pub-id pub-id-type="doi">10.1148/radiol.230970</pub-id><pub-id pub-id-type="medline">37489981</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Russe</surname><given-names>MF</given-names> </name><name name-style="western"><surname>Fink</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ngo</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT, human radiologists, and context-aware ChatGPT in identifying AO codes from radiology reports</article-title><source>Sci Rep</source><year>2023</year><month>08</month><day>30</day><volume>13</volume><issue>1</issue><fpage>14215</fpage><pub-id pub-id-type="doi">10.1038/s41598-023-41512-8</pub-id><pub-id pub-id-type="medline">37648742</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Hanna</surname><given-names>TN</given-names> </name><name name-style="western"><surname>Zygmont</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Peterson</surname><given-names>R</given-names> </name><etal/></person-group><article-title>The effects of fatigue from overnight shifts on radiology search patterns and diagnostic performance</article-title><source>J Am Coll Radiol</source><year>2018</year><month>12</month><volume>15</volume><issue>12</issue><fpage>1709</fpage><lpage>1716</lpage><pub-id pub-id-type="doi">10.1016/j.jacr.2017.12.019</pub-id><pub-id pub-id-type="medline">29366599</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Krupinski</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Berbaum</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Caldwell</surname><given-names>RT</given-names> </name><name name-style="western"><surname>Schartz</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name></person-group><article-title>Long radiology workdays reduce detection and accommodation accuracy</article-title><source>J Am Coll Radiol</source><year>2010</year><month>09</month><volume>7</volume><issue>9</issue><fpage>698</fpage><lpage>704</lpage><pub-id pub-id-type="doi">10.1016/j.jacr.2010.03.004</pub-id><pub-id pub-id-type="medline">20816631</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sistrom</surname><given-names>CL</given-names> </name><name name-style="western"><surname>Slater</surname><given-names>RM</given-names> </name><name 
name-style="western"><surname>Rajderkar</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Grajo</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Rees</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Mancuso</surname><given-names>AA</given-names> </name></person-group><article-title>Full resolution simulation for evaluation of critical care imaging interpretation; part 1: fixed effects identify influences of exam, specialty, fatigue, and training on resident performance</article-title><source>Acad Radiol</source><year>2020</year><month>07</month><volume>27</volume><issue>7</issue><fpage>1006</fpage><lpage>1015</lpage><pub-id pub-id-type="doi">10.1016/j.acra.2019.11.023</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kidwai</surname><given-names>AS</given-names> </name><name name-style="western"><surname>Abujudeh</surname><given-names>HH</given-names> </name></person-group><article-title>Radiologist productivity increases with real-time monitoring: the Hawthorne effect</article-title><source>J Am Coll Radiol</source><year>2015</year><month>11</month><volume>12</volume><issue>11</issue><fpage>1151</fpage><lpage>1154</lpage><pub-id pub-id-type="doi">10.1016/j.jacr.2015.03.041</pub-id><pub-id pub-id-type="medline">26208406</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Example for metric calculation. Table S1. Detailed prompt parameters of large language models used in this study; Table S2. Clinical impact and definition of false positive responses generated by DeepSeek-R1; Table S3. Example dataset for metric calculation; Table S4. Interrater agreement between LLMs and radiologists in a 0-shot setting; Table S5. 
Interrater agreement between LLMs and radiologists in a few-shot setting; Figure S1. The examples of various errors; Figure S2. Distribution of the 5 error types for both real and artificial errors (datasets 1 and 2); and Figure S3. Time efficiency analysis. Bar graphs comparing reading time for error detection for the large language models and radiologists.</p><media xlink:href="jmir_v28i1e86841_app1.docx" xlink:title="DOCX File, 7082 KB"/></supplementary-material></app-group></back></article>