<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e86692</article-id><article-id pub-id-type="doi">10.2196/86692</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Performance of Vision-Enabled Large Language Models in Image-Based Electrocardiogram Interpretation: Exploratory Evaluation</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Soubh</surname><given-names>Nibras</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Rasenack</surname><given-names>Eva</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Haarmann</surname><given-names>Helge</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Wiedmann</surname><given-names>Felix</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zabel</surname><given-names>Markus</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Schmidt</surname><given-names>Constanze</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Suliman</surname><given-names>Rayan</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Bergau</surname><given-names>Leonard</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Cardiology and Pneumology, University Medical Center G&#x00F6;ttingen (UMG)</institution><addr-line>Robert-Koch-Str. 
40</addr-line><addr-line>G&#x00F6;ttingen</addr-line><country>Germany</country></aff><aff id="aff2"><institution>German Centre for Cardiovascular Research, Partner Site Lower Saxony, University of G&#x00F6;ttingen</institution><addr-line>G&#x00F6;ttingen</addr-line><country>Germany</country></aff><aff id="aff3"><institution>Institute of Clinical Chemistry and Laboratory Medicine, Municipal Hospital Dresden</institution><addr-line>Dresden</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Vadathya</surname><given-names>Anil Kumar</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Okolie</surname><given-names>Awele</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Patel</surname><given-names>Birjukumar</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Komut</surname><given-names>Seval</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Liu</surname><given-names>Zhao</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Nibras Soubh, MD, Department of Cardiology and Pneumology, University Medical Center G&#x00F6;ttingen (UMG), Robert-Koch-Str. 
40, G&#x00F6;ttingen, 37075, Germany, 49 15114609645; <email>nibras.soubh@med.uni-goettingen.de</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>3</day><month>6</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e86692</elocation-id><history><date date-type="received"><day>31</day><month>10</month><year>2025</year></date><date date-type="rev-recd"><day>23</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>24</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Nibras Soubh, Eva Rasenack, Helge Haarmann, Felix Wiedmann, Markus Zabel, Constanze Schmidt, Rayan Suliman, Leonard Bergau. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 3.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e86692"/><abstract><sec><title>Background</title><p>Vision-enabled large language models (VE-LLMs) have the potential to provide flexible and explainable medical image interpretation. However, their real-world performance on clinical data, such as 12-lead electrocardiograms (ECGs), has not been systematically assessed.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate the diagnostic accuracy and reliability of state-of-the-art generalist VE-LLMs in interpreting real-world ECG images.</p></sec><sec sec-type="methods"><title>Methods</title><p>We tested 6 generalist VE-LLMs (ChatGPT-5, ChatGPT-4, Gemini 2.5, Copilot, Claude Sonnet-4, and Claude Opus-4.1) using 70 deidentified ECG images. A standardized prompt requested 9 determinations: rhythm, first-degree atrioventricular (AV) block, intraventricular conduction block and pattern, corrected QT (QTc) prolongation, premature atrial and ventricular contractions, ischemic ST-segment deviation, and axis deviation. An expert consensus served as the reference standard. Moreover, 2 image-based ECG-specialized LLMs (PULSE-7B and ECG-Instruct-Llama-3.2-11B-Vision) were tested for exploratory comparison. Model outputs were evaluated using overall and per-category diagnostic metrics.</p></sec><sec sec-type="results"><title>Results</title><p>Overall balanced accuracy across generalist models ranged from 50.1% to 61.8% (Cochran Q, <italic>P</italic>&#x003C;.001). ChatGPT-5 achieved the highest balanced accuracy (61.8%) but had the slowest response time (median 276, IQR 110-407 s), whereas Copilot responded within a median of 3 (IQR 2-4) seconds. 
Balanced accuracy for rhythm classification ranged from 38.6% to 55.8%, but sensitivity for atrial fibrillation among generalist models was &#x2264;11.1%, detecting either none or only 1 of the 9 cases. Detection of first-degree AV block (sensitivity 0%&#x2010;22%; 0/9 to 2/9) and QTc prolongation (sensitivity 0%&#x2010;45.5%; 0/22 to 10/22) was poor. Intraventricular block was identified with up to 67.8% balanced accuracy, but correct subtype assignment was &#x2264;44% (&#x2264;11/25). ST-segment deviation sensitivity was &#x003C;25% for all generalist models (highest 3/14). Agreement with expert interpretation was low, with Cohen &#x03BA; indicating poor-to-fair concordance (&#x03BA;&#x2264;0.39). Specialized models achieved overall balanced accuracy of 56.5% (ECG-Instruct-Llama-3.2-11B-Vision) and 64.4% (PULSE-7B), with PULSE-7B showing higher task-specific balanced accuracy in rhythm classification and ectopic beats detection (up to 86.3% and 89.2%, respectively).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>VE-LLMs showed moderate overall performance but mostly low sensitivity and limited agreement with expert ECG interpretation. Current performance remains inconsistent across models and diagnostic categories and is insufficient to support clinical deployment.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>electrocardiography</kwd><kwd>ECG</kwd><kwd>machine learning</kwd><kwd>large language models</kwd><kwd>LLMs</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Large language models (LLMs) are artificial intelligence (AI) systems trained on vast text corpora to generate human-like responses. The public release of the first generalist LLM&#x2014;OpenAI&#x2019;s ChatGPT&#x2014;in late 2022 marked a major inflection point. 
In just 2 years, LLMs have influenced many aspects of daily life and professional practice, with successive generations steadily expanding conversational and problem-solving capabilities. These models are now being explored across education, law, and scientific research, reshaping workflows while raising new societal and ethical questions.</p><p>In medicine, LLMs have shown notable promise on knowledge-based tasks. For example, Google&#x2019;s Med-PaLM and Med-PaLM 2 achieved scores exceeding the passing threshold for the USMLE (United States Medical Licensing Examination) [<xref ref-type="bibr" rid="ref1">1</xref>]. Meanwhile, multiple studies evaluating publicly accessible LLMs across international medical examinations indicate that newer versions can perform well, although results vary by model, language, and examination difficulty [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. Beyond examinations, LLMs are being piloted for clinical use cases, such as drafting documentation, summarizing literature, and answering patient queries in lay terms [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. However, clinical deployment requires rigorous validation and appropriate regulatory frameworks, particularly for data protection and patient privacy.</p><p>The 12-lead electrocardiogram (ECG) is a cornerstone diagnostic tool in cardiology and emergency medicine. Recent advances in AI&#x2014;especially deep learning&#x2014;have yielded strong results in automated ECG analysis. Convolutional neural network (CNN) models trained on large ECG datasets can achieve expert-level interpretation and may uncover latent features not apparent to human readers. 
For instance, multiple AI ECG models have been shown to predict the near-term onset of atrial fibrillation (AF) from ECGs recorded in apparent sinus rhythm (SR), detecting subclinical signatures invisible to clinicians [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. EchoNext&#x2014;a deep learning model trained on more than one million ECG waveforms and imaging records&#x2014;outperformed cardiologists in detecting structural heart disease across diverse clinical settings and patient groups [<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>This emerging field of AI-enhanced ECG analysis promises faster and potentially more sensitive screening for cardiac pathology using a ubiquitous, inexpensive test. Despite progress, most AI ECG solutions remain task specific, have limited accessibility, require waveform (signal data) as input, and lack the flexible reasoning of human interpreters. Integrating advanced LLMs into image-based ECG interpretation could combine the pattern recognition strengths of deep learning with the linguistic and contextual reasoning of LLMs. Historically, generalist LLMs accepted only text input. Newer models are multimodal, accepting images in addition to text. GPT-4 with &#x201C;Vision&#x201D; (released in 2023) demonstrated that an LLM augmented with a visual encoder can describe images and answer questions about them. Competing vision-enabled LLMs (VE-LLMs) soon followed. 
Early experience showed impressive performance on general images&#x2014;object recognition, chart interpretation, handwriting&#x2014;yet the ability of generalist VE-LLMs to interpret complex medical images remains uncertain, with only a few studies assessing earlier versions on simplified, textbook ECG test sets [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>Previous studies evaluating ChatGPT-4 have largely focused on single-model assessments, limited diagnostic tasks, or simplified test settings, often without structured benchmarking across multiple models and clinically derived datasets. Given the rapid evolution of LLM capabilities, we conducted a comprehensive evaluation of the newest VE-LLMs as of August 2025 (including ChatGPT-5 and Claude Opus-4.1) on a clinically relevant image understanding task: 12-lead ECG interpretation. We focused on fundamental ECG findings&#x2014;rhythm, atrioventricular (AV) and intraventricular conduction delays (IVCDs), ectopy, and ischemia&#x2014;routinely assessed by clinicians and automated algorithms. By comparing 6 prominent generalist models side-by-side, we sought to determine how performance varies across models and diagnostic categories and how models compare with human experts and with 2 further ECG-specialized image-based LLMs (PULSE-7B and ECG-Instruct-Llama-3.2-11B-Vision).</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This retrospective diagnostic study was conducted at the University Medical Center G&#x00F6;ttingen, Germany, using deidentified 12-lead ECG images interpreted by AI models, with expert human interpretation as the reference standard. The study was reviewed and approved by the local institutional ethics committee (approval 18/08/24). 
In accordance with the ethics committee decision and given the retrospective design and the use of fully anonymized data, the requirement for informed consent was waived. All data were handled in accordance with institutional data protection policies, and no identifiable personal information was included in the dataset or study outputs. No participants received financial compensation.</p></sec><sec id="s2-2"><title>ECG Data&#x2014;Acquisition and Preparation</title><p>Seventy 12-lead ECGs were collected from consecutive patients admitted to the heart rhythm ward between June 1 and 15, 2025. Recordings were obtained using the standard hospital device (Cardiovit AT-102 G2; Schiller Medizintechnik GmbH) at 500 Hz, with a paper speed of 50 mm/s and 10 mm/mV calibration. Each ECG was originally in PDF format and included patient metadata and automated interpretations. Patient information, demographics, and machine-generated measurements were removed using Inkscape (Inkscape Project, version 1.3.2), leaving only the waveform grid and tracings. Images were exported as high-resolution JPEG files (300 dots per inch). The final dataset contained 70 ECG images, each a single page with the standard 12 leads (I, II, III, aVR, aVL, aVF, and V1-V6). Two physicians independently reviewed all ECGs to determine the correct findings for each of the 9 diagnostic questions (detailed in the following section). Discrepancies were resolved by consensus discussion. These expert interpretations served as the reference standard. Interval assessments (PR, QRS, and QTc) by expert readers were performed using manual measurements from the ECG images based on standard calibration. 
Difficulty of ECGs was classified into three categories: (1) easy: normal ECG or isolated clear pathological findings with no baseline disturbances or borderline measurements; (2) intermediate: borderline measurements, subtle findings, baseline artifacts, and the co-occurrence of multiple pathological findings; and (3) difficult: multiple or rare pathological findings and multiple borderline measurements.</p></sec><sec id="s2-3"><title>Selection and Inference of LLMs</title><p>We identified 7 publicly available generalist multimodal LLMs (accepting image and text input) that represented the top-tier models from major AI developers: ChatGPT-4 and ChatGPT-5 (OpenAI), Gemini 2.5 (Google), Copilot (Microsoft; GPT-4-based, with &#x201C;Think Deeper&#x201D; activated), Grok-4 (xAI), Claude Sonnet-4, and Claude Opus-4.1 (Anthropic).</p><p>We excluded Meta AI&#x2019;s LLM, as no publicly available multimodal model was provided as of August 2025, and DeepSeek because its vision-enabled chat model is limited to text extraction from images as stated by the model itself. Grok-4 inference was performed via the Perplexity Pro platform; however, it was excluded from final analysis following the disclosure of a &#x201C;silent fallback bug&#x201D; by the platform provider. Due to this technical issue, queries intended for Grok-4 were intermittently routed to ChatGPT-4 without user notification, confounding the model-specific results. All other models were queried via their public web interfaces using default settings and within their usage policies and rate limits. As inference was conducted through publicly accessible web interfaces rather than APIs, exact backend model version identifiers were not exposed to users. 
Model provenance was therefore documented using platform name, model designation, subscription level, geographic inference location, client environment, and precise inference dates (Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), consistent with current reporting practices for evaluations of publicly deployed multimodal systems.</p><p>Additionally, we evaluated 2 open-source LLMs specialized for ECG image interpretation: ECG-Instruct-Llama-3.2-11B-Vision (ECG-Instruct-Llama-3.2), a multimodal Llama-3.2 model adapted to ECGs using parameter-efficient Low-Rank Adaptation (LoRA) fine-tuning [<xref ref-type="bibr" rid="ref11">11</xref>]; and PULSE-7B, a smaller LLaVA-based vision language model instruction tuned on ECG image&#x2013;text data [<xref ref-type="bibr" rid="ref12">12</xref>]. Model inference was performed in a cloud-based high-performance computing environment (Google Colab Pro+, Google LLC), with detailed environment and inference settings provided in Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-4"><title>Prompt Design and Results Mapping</title><p>Pilot testing was performed on 20 ECGs (not included in the study&#x2019;s dataset). Short prompts often produced incomplete or pattern-based answers. To mitigate this, we optimized a prompt to explicitly instruct step-by-step visual analysis. We incorporated advice from the models themselves and known best practices for eliciting analytical responses. Broadly applicable suggestions from the models (eg, emphasizing lead-by-lead analysis and the explicit use of visual detail) were incorporated, while the final standardized prompt was defined by the investigators and applied uniformly across models. The final prompt given to all models for each ECG was as follows:</p><p>&#x201C;Please perform a structured, lead-by-lead visual analysis of this ECG image (paper speed: 50 mm/s, amplitude: 10 mm/mV). 
For each lead (I, II, III, aVR, aVL, aVF, V1&#x2013;V6), identify the P, QRS, T waves and describe their durations and characteristics. Do not rely on pattern-matching shortcuts or global assumptions &#x2013; use the detailed visual information from the image. After analyzing, answer the following numbered questions (with a single answer each):</p><list list-type="order"><list-item><p>What is the cardiac rhythm? (Choose one: &#x201C;sinus rhythm,&#x201D; &#x201C;atrial fibrillation,&#x201D; &#x201C;paced rhythm,&#x201D; or &#x201C;other&#x201D;)</p></list-item><list-item><p>Is there a first-degree AV block? (&#x201C;yes&#x201D; or &#x201C;no&#x201D;)</p></list-item><list-item><p>Is an intraventricular conduction block present? (&#x201C;yes&#x201D; or &#x201C;no&#x201D;)</p></list-item><list-item><p>If yes, which type of bundle branch block is it? (Choose &#x201C;RBBB,&#x201D; &#x201C;LBBB,&#x201D; or &#x201C;nonspecific&#x201D;)</p></list-item><list-item><p>Is the QTc interval &#x003E;450 ms (QT prolongation)? (&#x201C;yes&#x201D; or &#x201C;no&#x201D;)</p></list-item><list-item><p>Are there any premature supraventricular beats (PACs)? (&#x201C;yes&#x201D; or &#x201C;no&#x201D;)</p></list-item><list-item><p>Are there any premature ventricular beats (PVCs)? (&#x201C;yes&#x201D; or &#x201C;no&#x201D;)</p></list-item><list-item><p>Are there significant ST-segment deviations suggestive of myocardial ischemia? (&#x201C;yes&#x201D; or &#x201C;no&#x201D;)</p></list-item><list-item><p>Is the QRS axis abnormal (deviated beyond +90&#x00B0; or &#x2013;30&#x00B0;)? (&#x201C;yes&#x201D; or &#x201C;no&#x201D;).&#x201D;</p></list-item></list><p>We explicitly mentioned calibration and discouraged shortcuts to encourage actual measurement. We also constrained answers to the required format (yes or no or single option) for easier evaluation. All models received the exact same prompt text with no vendor-specific adjustments to default system settings. 
If a model&#x2019;s answer did not follow the format (eg, gave explanations or multiple options), we recorded its final explicit answer for each numbered item. No further prompt engineering or follow-up questions were applied. Each model thus output 9 answers per ECG, which we collected for analysis. Authors extracting and mapping model outputs were blinded to the correct answers. Additionally, we measured the response latency for each generalist model: specifically, the time from submitting the prompt to the model beginning to output the first word of its answer.</p><p>Given their substantially smaller size and domain-specific optimization compared with frontier generalist multimodal LLMs&#x2014;which are typically optimized for complex, multistep prompting&#x2014;we additionally evaluated the ECG-specialized models using a short &#x201C;impression-style&#x201D; prompt to provide a model-appropriate testing condition. The short prompt was as follows:</p><p>&#x201C;Analyze this 12-lead ECG and provide a concise cardiology-style impression (rhythm, conduction, axis, QTc, ectopy, ischemia)&#x201D;</p><p>Generalist LLMs and the standard-prompted PULSE-7B adhered to the structured prompt and usually produced numbered answers in the requested format without contradictory or missing responses, although isolated deviations from the predefined option set required minimal manual mapping during dataset construction (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). The ECG-Instruct-Llama-3.2 model did not consistently follow the structured response format and frequently generated narrative ECG reports instead of discrete answers. Therefore, a postprocessing step was applied for this model and for the short-prompted PULSE-7B. Model outputs were manually reviewed and mapped to predefined diagnostic categories. 
A category was recorded as positive only when a corresponding finding (eg, bundle branch block or QT prolongation) was stated explicitly and unequivocally. If a finding was not mentioned or described ambiguously, it was classified as negative for that category.</p></sec><sec id="s2-5"><title>Outcome Measures</title><p>The reference answers were (1) rhythm classified as SR, AF, paced, or other; (2) first-degree AV block present or not (defined as PR interval &#x003E;200 ms); (3) intraventricular conduction block present or not (defined as QRS prolongation &#x003E;120 ms); (4) if an intraventricular conduction block was present, type classified as right bundle branch block, left bundle branch block, or nonspecific IVCD. Performance metrics in this category were calculated as a multiclass classification across the full dataset (correct or incorrect), with &#x201C;no block&#x201D; treated as a separate class; (5) QT prolongation defined as corrected QT (QTc) &#x003E;450 ms (using Bazett&#x2019;s formula; a single threshold was applied as patient sex information was not available); (6) premature atrial contractions (PACs) present or not; (7) premature ventricular contractions (PVCs) present or not; (8) ST-segment changes significant for ischemia or not, measured at the J-point in contiguous leads, with a cutoff for depression of &#x2265;2 mm and elevation of &#x2265;1 mm in all leads except V2 and V3, where a cutoff of &#x2265;2 mm was applied due to unavailability of age and sex data; in the presence of bundle branch block or paced rhythm, modified Sgarbossa criteria were used; and (9) QRS axis abnormal or not (right axis &#x003E;+90&#x00B0; or left axis &#x003C;&#x2013;30&#x00B0;).</p></sec><sec id="s2-6"><title>Performance Metrics and Statistics</title><p>Given the exploratory nature of this study, no prospective power analysis was performed. 
A post-hoc calculation of the minimum detectable difference for the primary outcome indicated that with a sample size of 70, the study was powered (80%; &#x03B1;=.05) to detect differences of at least 22% in the overall accuracy. We calculated conventional classification metrics for each model on each question: accuracy, sensitivity, specificity, positive predictive value, negative predictive value, balanced accuracy, and <italic>F</italic><sub>1</sub>-score. These were computed with the binary interpretation (eg, for QT prolongation: &#x201C;yes&#x201D;=positive and &#x201C;no&#x201D;=negative). For multiclass categories, we reported overall accuracy. Ninety-five percent CIs were calculated using the Wilson method for proportions and bootstrap resampling where appropriate. To contextualize model performance relative to a naive baseline, we additionally compared each model with a majority class classifier (ZeroR). The Cochran Q test was used to investigate overall differences in accuracy among the models. Paired comparisons between each model and the majority class classifier were performed using the McNemar test with Bonferroni correction, and the adjusted <italic>P</italic> values were reported. To assess agreement between each model&#x2019;s classifications and the expert reference, Cohen &#x03BA; was used. &#x03BA; values were interpreted as &#x003C;0.20 (poor), 0.21 to 0.40 (fair), 0.41 to 0.60 (moderate), 0.61 to 0.80 (good), and &#x003E;0.80 (excellent agreement beyond chance). We also calculated the per-case total score for each model (out of 9 questions per ECG) as a crude measure of overall ECG interpretation performance. The median scores were compared between models. Finally, response times were summarized per model (median, IQR) as a measure of efficiency. For continuous or ordinal outcomes such as the per-case score and response time, we used Friedman ANOVA to test for global differences and Wilcoxon signed-rank tests for pairwise model comparisons. 
All hypothesis tests were conducted using a 2-sided approach, with statistical significance defined as <italic>P</italic>&#x003C;.05. Statistical analysis was performed using R (R Foundation for Statistical Computing, version 4.4.0); the analysis codes are provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Dataset Characteristics</title><p>The 70 ECGs encompassed a wide spectrum of findings. Normal SR was present in 57 (81%) cases, AF in 9 (13%) cases, pacemaker rhythm in 3 (4%) cases, and AF with pacemaker stimulation in 1 (1%) case. First-degree AV block was present in 9 (13%) cases. Intraventricular conduction block was present in 25 (36%) cases&#x2014;including 9 with right bundle branch block, 8 with left bundle branch block, and 8 with nonspecific IVCD. QT prolongation was noted in 22 cases (31%). PACs were rare (n=3, 4%), whereas PVCs were seen in 10 (14%) cases. ST-segment changes suggestive of myocardial ischemia were present in 14 cases (20%). Abnormal QRS axis (right or left axis deviation) was present in 12 cases (17%). The anonymized ground-truth label matrix is provided in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>. The interrater agreement between the 2 cardiologists was high, with an overall percent agreement of 97.3%. Category-specific agreement ranged from 94.2% (suspected ST-segment changes) to 100% (rhythm and PVCs). Cohen &#x03BA; indicated substantial to almost perfect reliability across domains, ranging from 0.80 for suspected ST-segment changes to 1.00 for rhythm and PVCs classification. 
Detailed interrater agreement metrics are provided in Table S3 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>.</p></sec><sec id="s3-2"><title>Model Response Times</title><p>User-perceived response latency varied strikingly across models (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Microsoft Copilot was the fastest, with a median response time of 3 (IQR 2-4) seconds. ChatGPT-4 was also relatively quick (median 5, IQR 4&#x2010;9 s). Anthropic&#x2019;s models were slower: Claude Sonnet-4 with a median of 24 (IQR 15&#x2010;36) seconds, and Claude Opus-4.1 with a median of 30 (IQR 25&#x2010;40) seconds. Gemini 2.5 responded in a median of 36 (IQR 27&#x2010;44) seconds. The slowest by far was ChatGPT-5, with a median latency of 276 seconds (4.6 min) and very high variability (IQR 110&#x2010;407 s). In 1 case, it took 14 minutes to begin responding. The difference spanned nearly 2 orders of magnitude, from seconds to minutes. Statistical testing confirmed a significant overall difference (Friedman <italic>&#x03C7;&#x00B2;</italic><sub>5</sub>=299.5; <italic>P</italic>&#x003C;.001; Kendall W=0.855). Pairwise comparisons showed ChatGPT-5 was significantly slower than every other model (adjusted <italic>P</italic>&#x003C;.001), and among the rest, Copilot was significantly faster than all other models (adjusted <italic>P</italic>&#x003C;.001).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Response times. Response times of generalist vision-enabled large language models (LLMs) when interpreting 12-lead electrocardiogram (ECG) images from a retrospective dataset of 70 deidentified ECGs collected during routine clinical care in a cardiology ward at the University Medical Center G&#x00F6;ttingen, Germany. Model inference was conducted in July to August 2025. 
The figure shows boxplots of the time each model required to begin generating an output.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86692_fig01.png"/></fig></sec><sec id="s3-3"><title>Overall Accuracy</title><p>The overall accuracy was estimated across all 9 diagnostic questions (computed as total correct answers out of 630 possible [70 ECGs&#x00D7;9 questions]). ChatGPT-5 achieved the highest aggregate accuracy (correct: 493/630, 78.3%) among generalist models, ahead of Claude Opus-4.1 (470/630, 74.6%) and Claude Sonnet-4 (464/630, 73.7%). Copilot was fourth (452/630, 71.7%), followed by Gemini 2.5 (445/630, 70.6%) and ChatGPT-4 (438/630, 69.5%). The range between the highest and lowest accuracy was relatively wide (approximately 10 percentage points), and the Cochran Q test on the full 6&#x00D7;630 binary outcome matrix was statistically significant (<italic>P</italic>&#x003C;.001). After correction for category imbalances, ChatGPT-5 also achieved the highest overall balanced accuracy (61.8%) and <italic>F</italic><sub>1</sub>-score (38%). Other models showed progressively lower balanced accuracy and <italic>F</italic><sub>1</sub>-scores reaching 50.1% and 13%, respectively (Claude Opus-4.1), as shown in Table S4 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>. The ZeroR (majority class) classifier achieved 79% accuracy, higher than all evaluated models. In McNemar testing with Bonferroni correction, a statistically significant asymmetry in discordant classifications favoring ZeroR was observed in all models except for ChatGPT-5 (Table S5 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). This aggregate view masks considerable variation in performance by question type, which we detail below. 
<xref ref-type="fig" rid="figure2">Figure 2</xref> provides an overview of each model&#x2019;s overall and category-specific balanced accuracy and agreement with human experts across the 9 diagnostic questions. ZeroR exceeded model per-task accuracy for all tasks except for ChatGPT-5 (4/9 tasks where model&#x003E;ZeroR) and ChatGPT-4 (1/9 tasks where model&#x003E;ZeroR). Detailed overall performance metrics, corresponding 95% CIs, the heatmap of unbalanced accuracy, and the full ZeroR comparisons across models and task categories are provided in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendices 5</xref> and <xref ref-type="supplementary-material" rid="app6">6</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Heatmaps of balanced accuracy and Cohen &#x03BA;. Balanced accuracy (left) and Cohen &#x03BA; for agreement with expert interpretation (right) across electrocardiogram (ECG) interpretation tasks, shown as heatmaps for all generalist vision-enabled large language models. The models were evaluated using a retrospective dataset of 70 deidentified 12-lead ECGs collected during routine clinical care in a cardiology ward at the University Medical Center G&#x00F6;ttingen, Germany, with expert consensus as the reference standard; model inference was conducted in July to August 2025. Balanced accuracy values are presented as percentages, and both balanced accuracy and &#x03BA; are visualized using a color scale (green=higher values and orange or red=lower values). 
IV-Block: intraventricular block; IV-Block Pattern: intraventricular block pattern; QTc: corrected QT interval; PAC: premature atrial contraction; PVC: premature ventricular contraction.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86692_fig02.png"/></fig></sec><sec id="s3-4"><title>Models' Performance Across Difficulty Spectrum</title><p>The 70 ECGs were stratified by expert-rated difficulty into 21 (30%) level-1 (straightforward), 31 (44.3%) level-2 (intermediate), and 18 (25.7%) level-3 (challenging) tracings. A consistent and statistically significant decrease in diagnostic accuracy was observed as the complexity of the tracings increased. This pattern was monotonic for every individual model as shown in <xref ref-type="fig" rid="figure3">Figure 3</xref>. ChatGPT-5 achieved the highest metrics in the most challenging stratum, achieving an overall accuracy and balanced accuracy of 60.6% and 58.6%, respectively. Detailed overall accuracies, balanced accuracies, and <italic>F</italic><sub>1</sub>-scores stratified by ECG difficulty are provided in Tables S6 and S7 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Accuracy of vision-enabled generalist large language models stratified by electrocardiogram (ECG) difficulty level (levels 1&#x2010;3), reported as the proportion of correct responses with 95% CIs (error bars). Models were evaluated using a retrospective dataset of 70 deidentified 12-lead ECGs collected during routine clinical care in a cardiology ward at the University Medical Center G&#x00F6;ttingen, Germany, with expert consensus as the reference standard. Model inference was conducted in July to August 2025. 
Statistical comparisons were performed per model using 2-sided Pearson <italic>&#x03C7;</italic>&#x00B2; tests on 3&#x00D7;2 contingency tables (correct vs incorrect responses) stratified by difficulty level.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86692_fig03.png"/></fig></sec><sec id="s3-5"><title>Rhythm Classification</title><p>Across all models, rhythm was correctly identified in 72.9% (51/70) to 82.9% (58/70) of cases. Statistical comparison showed a significant difference in the proportion of correct answers among models (Cochran Q test, <italic>P</italic>&#x003C;.001). The highest performance metrics among generalist models were achieved by Claude Sonnet-4 and ChatGPT-4 with the following accuracy, balanced accuracy, and agreement with human experts (&#x03BA;): 82.9%, 53.8%, and 0.12 and 81.4%, 55.8%, and 0.29, respectively. All generalist models showed high sensitivity for identifying SR (89%&#x2010;100%) but very low sensitivity for AF (&#x2264;11.1%). Most generalist LLMs failed to detect any AF cases. Only Copilot and Claude Sonnet-4 each detected 1 of 9 AF cases (sensitivity 11%). 
Despite higher accuracies, other adjusted diagnostic metrics, balanced accuracy and agreement with the human experts, were rather low and indicated poor performance as shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Measures regarding rhythm classification<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Measure</td><td align="left" valign="top">OpenAI ChatGPT-4</td><td align="left" valign="top">OpenAI ChatGPT-5</td><td align="left" valign="top">Google Gemini 2.5</td><td align="left" valign="top">Microsoft Copilot</td><td align="left" valign="top">Claude Sonnet-4</td><td align="left" valign="top">Claude Opus-4.1</td></tr></thead><tbody><tr><td align="left" valign="top">Diagnostic metric</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy, % (95% CI)</td><td align="char" char="." valign="top">81.4 (70.8&#x2010;88.8)</td><td align="char" char="." valign="top">72.9 (61.5&#x2010;81.9)</td><td align="char" char="." valign="top">78.6 (67.6&#x2010;86.6)</td><td align="char" char="." valign="top">81.4 (70.8&#x2010;88.8)</td><td align="char" char="." valign="top">82.9 (72.4&#x2010;89.9)</td><td align="char" char="." valign="top">81.4 (70.8&#x2010;88.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Balanced accuracy, % (95% CI)</td><td align="char" char="." valign="top">55.8 (44.2&#x2010;70.6)</td><td align="char" char="." valign="top">38.6 (33.3&#x2010;43.4)</td><td align="char" char="." 
valign="top">50.3 (42.9&#x2010;61)</td><td align="char" char="." valign="top">52 (45.3&#x2010;63.6)</td><td align="char" char="." valign="top">53.8 (50&#x2010;62.5)</td><td align="char" char="." valign="top">50 (50&#x2010;50)</td></tr><tr><td align="left" valign="top">Agreement measure</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Cohen &#x03BA;<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="char" char="." valign="top">0.29</td><td align="char" char="." valign="top">0.39</td><td align="char" char="." valign="top">0.14</td><td align="char" char="." valign="top">0.18</td><td align="char" char="." valign="top">0.12</td><td align="char" char="." valign="top">0</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Performance of generalist vision-enabled large language models in classifying cardiac rhythm from 70 deidentified 12-lead electrocardiograms collected during routine clinical care in a cardiology ward at the University Medical Center G&#x00F6;ttingen, Germany, with expert consensus as the reference standard. Model inference was conducted in July to August 2025.</p></fn><fn id="table1fn2"><p><sup>b</sup>Agreement with expert interpretation measured using Cohen &#x03BA;.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-6"><title>Identification of First-Degree AV Block</title><p>Accuracy for detecting first-degree AV block ranged from 75.7% (Claude Opus-4.1) to 87.1% (ChatGPT-5 and Claude Sonnet-4). However, these high accuracies primarily reflected a tendency of many models to answer &#x201C;no&#x201D; for all cases, thereby correctly classifying true negatives but missing all positives. 
Balanced accuracy dropped in all models with the lowest in Claude Opus-4.1 (43.4%) and the highest in ChatGPT-4 (57.8%). Agreement with human experts, assessed using Cohen &#x03BA;, was generally poor, ranging from &#x03BA;=&#x2212;0.14 (Claude Opus-4.1) to &#x03BA;=0.18 (ChatGPT-4). The Cochran Q test found a significant overall difference among models for AV block detection (<italic>P</italic>&#x003C;.001). <xref ref-type="table" rid="table2">Table 2</xref> summarizes diagnostic metrics for this task.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Measures regarding identification of first-degree atrioventricular block<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Measures</td><td align="left" valign="bottom">OpenAI ChatGPT-4</td><td align="left" valign="bottom">OpenAI ChatGPT-5</td><td align="left" valign="bottom">Google Gemini 2.5</td><td align="left" valign="bottom">Microsoft Copilot</td><td align="left" valign="bottom">Claude Sonnet-4</td><td align="left" valign="bottom">Claude Opus-4.1</td></tr></thead><tbody><tr><td align="left" valign="top">Diagnostic metric</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sensitivity, % (95% CI)</td><td align="left" valign="top">22.2 (6.3&#x2010;54.7)</td><td align="left" valign="top">0 (0&#x2010;29.9)</td><td align="left" valign="top">0 (0&#x2010;29.9)</td><td align="left" valign="top">0 (0&#x2010;29.9)</td><td align="left" valign="top">11.1 (2&#x2010;43.5)</td><td align="left" valign="top">0 (0&#x2010;29.9)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Specificity, % (95% CI)</td><td align="left" valign="top">93.4 (84.3&#x2010;97.4)</td><td align="left" valign="top">100 (94.1&#x2010;100)</td><td align="left" valign="top">90.2 (80.2&#x2010;95.4)</td><td align="left" valign="top">96.7 (88.8&#x2010;99.1)</td><td align="left" valign="top">98.4 (91.3&#x2010;99.7)</td><td align="left" valign="top">86.9 (76.2&#x2010;93.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>PPV<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup>, % (95% CI)</td><td align="left" valign="top">33.3 (9.7&#x2010;70)</td><td align="left" valign="top">NA<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">0 (0&#x2010;39)</td><td align="left" valign="top">0 (0&#x2010;65.8)</td><td align="left" valign="top">50 (9.5&#x2010;90.5)</td><td align="left" valign="top">0 (0&#x2010;32.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NPV<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup>, % (95% CI)</td><td align="left" valign="top">89.1 (79.1&#x2010;94.6)</td><td align="left" valign="top">87.1 (77.3&#x2010;93.1)</td><td align="left" valign="top">85.9 (75.4&#x2010;92.4)</td><td align="left" valign="top">86.8 (76.7&#x2010;92.9)</td><td align="left" valign="top">88.2 (78.5&#x2010;93.9)</td><td align="left" valign="top">85.5 (74.7&#x2010;92.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy, % (95% CI)</td><td align="left" valign="top">84.3 (74&#x2010;91)</td><td align="left" valign="top">87.1 (77.3&#x2010;93.1)</td><td align="left" valign="top">78.6 (67.6&#x2010;86.6)</td><td align="left" valign="top">84.3 (74&#x2010;91)</td><td align="left" valign="top">87.1 (77.3&#x2010;93.1)</td><td align="left" 
valign="top">75.7 (64.5&#x2010;84.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Balanced accuracy, % (95% CI)</td><td align="left" valign="top">57.8 (45.2&#x2010;74.2)</td><td align="left" valign="top">50 (50&#x2010;50)</td><td align="left" valign="top">45.1 (41.1&#x2010;48.4)</td><td align="left" valign="top">48.4 (45.8&#x2010;50)</td><td align="left" valign="top">54.7 (47.8&#x2010;67.7)</td><td align="left" valign="top">43.4 (38.9&#x2010;47.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>F</italic><sub>1</sub>-score (95% CI)</td><td align="left" valign="top">26.7 (11.8&#x2010;57.1)</td><td align="left" valign="top">NA</td><td align="left" valign="top">NA</td><td align="left" valign="top">NA</td><td align="left" valign="top">18.2 (12.5&#x2010;53.3)</td><td align="left" valign="top">NA</td></tr><tr><td align="left" valign="top">Agreement measure</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x03BA;<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">0.18</td><td align="left" valign="top">0</td><td align="left" valign="top">&#x2212;0.11</td><td align="left" valign="top">&#x2212;0.05</td><td align="left" valign="top">0.14</td><td align="left" valign="top"><bold>&#x2212;0.14</bold></td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Performance of generalist vision-enabled large language models in detecting first-degree atrioventricular block from 70 deidentified 12-lead electrocardiograms collected during routine clinical care in a cardiology ward at the University 
Medical Center G&#x00F6;ttingen, Germany, with expert consensus as the reference standard. Model inference was conducted in July to August 2025. Prevalence of first-degree atrioventricular block was 9/70 (13%).</p></fn><fn id="table2fn2"><p><sup>b</sup>PPV: positive predictive value.</p></fn><fn id="table2fn3"><p><sup>c</sup>NA: not applicable because no positives and/or no positive predictions.</p></fn><fn id="table2fn4"><p><sup>d</sup>NPV: negative predictive value.</p></fn><fn id="table2fn5"><p><sup>e</sup>Agreement with expert interpretation measured using Cohen &#x03BA;.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-7"><title>Identification and Classification of Intraventricular Conduction Blocks</title><p>Model accuracy in detecting and classifying these wide QRS patterns varied significantly (Cochran Q, <italic>P</italic>&#x003C;.001). ChatGPT-5 showed the highest performance metrics in this task, correctly identifying block in 15 cases and normal conduction in 34 cases (accuracy: 70%, balanced accuracy: 67.8%, <italic>F</italic><sub>1</sub>-score: 58.8%, and agreement with human raters: <italic>&#x03BA;</italic>=0.35). In contrast, ChatGPT-4 (accuracy: 35.7%, balanced accuracy: 46.4%, <italic>F</italic><sub>1</sub>-score: 48.3%, and agreement with human raters: &#x03BA;=&#x2212;0.05) classified almost all cases as abnormal, yielding a sensitivity of 84% (21/25 true positives) but 41 false positives, for a specificity of only 8.9%. 
The diagnostic metrics of all models in this task are presented in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Measures regarding identification of intraventricular conduction block<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Measure</td><td align="left" valign="bottom">OpenAI ChatGPT-4</td><td align="left" valign="bottom">OpenAI ChatGPT-5</td><td align="left" valign="bottom">Google Gemini 2.5</td><td align="left" valign="bottom">Microsoft Copilot</td><td align="left" valign="bottom">Claude Sonnet-4</td><td align="left" valign="bottom">Claude Opus-4.1</td></tr></thead><tbody><tr><td align="left" valign="top">Diagnostic metrics</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sensitivity, % (95% CI)</td><td align="left" valign="top">84<break/>(65.3&#x2010;93.6)</td><td align="left" valign="top">60<break/>(40.7&#x2010;76.6)</td><td align="left" valign="top">68<break/>(48.4&#x2010;82.8)</td><td align="left" valign="top">68<break/>(48.4&#x2010;82.8)</td><td align="left" valign="top">56<break/>(37.1&#x2010;73.3)</td><td align="left" valign="top">36<break/>(20.2&#x2010;55.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Specificity, % (95% CI)</td><td align="left" valign="top">8.9<break/>(3.5&#x2010;20.7)</td><td align="left" valign="top">75.6<break/>(61.3&#x2010;85.8)</td><td align="left" valign="top">57.8<break/>(43.3&#x2010;71)</td><td align="left" valign="top">31.1<break/>(19.5&#x2010;45.7)</td><td align="left" 
valign="top">46.7<break/>(32.9&#x2010;60.9)</td><td align="left" valign="top">73.3<break/>(59&#x2010;84)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>PPV<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>, % (95% CI)</td><td align="left" valign="top">33.9<break/>(23.3&#x2010;46.3)</td><td align="left" valign="top">57.7<break/>(38.9&#x2010;74.5)</td><td align="left" valign="top">47.2<break/>(32&#x2010;63)</td><td align="left" valign="top">35.4<break/>(23.4&#x2010;49.6)</td><td align="left" valign="top">36.8<break/>(23.4&#x2010;52.7)</td><td align="left" valign="top">42.9<break/>(24.5&#x2010;63.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NPV<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup>, % (95% CI)</td><td align="left" valign="top">50<break/>(21.5&#x2010;78.5)</td><td align="left" valign="top">77.3<break/>(63&#x2010;87.2)</td><td align="left" valign="top">76.5<break/>(60&#x2010;87.6)</td><td align="left" valign="top">63.6<break/>(43&#x2010;80.3)</td><td align="left" valign="top">65.6<break/>(48.3&#x2010;79.6)</td><td align="left" valign="top">67.3<break/>(53.4&#x2010;78.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy, % (95% CI)</td><td align="left" valign="top">35.7<break/>(25.5&#x2010;47.4)</td><td align="left" valign="top">70<break/>(58.5&#x2010;79.5)</td><td align="left" valign="top">61.4<break/>(49.7&#x2010;72)</td><td align="left" valign="top">44.3<break/>(33.2&#x2010;55.9)</td><td align="left" valign="top">50<break/>(38.6&#x2010;61.4)</td><td align="left" valign="top">60<break/>(48.3&#x2010;70.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Balanced accuracy, % (95% CI)</td><td align="left" 
valign="top">46.4<break/>(37.6&#x2010;54.1)</td><td align="left" valign="top">67.8<break/>(55.6&#x2010;79)</td><td align="left" valign="top">62.9<break/>(50.1&#x2010;74.3)</td><td align="left" valign="top">49.6<break/>(37.8&#x2010;61.2)</td><td align="left" valign="top">51.3<break/>(39&#x2010;63.6)</td><td align="left" valign="top">54.7<break/>(43.4&#x2010;66.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>F</italic><sub>1</sub>-score (95% CI)</td><td align="left" valign="top">48.3<break/>(34.6&#x2010;61.1)</td><td align="left" valign="top">58.8<break/>(40.9&#x2010;73.3)</td><td align="left" valign="top">55.7<break/>(39.2&#x2010;69.6)</td><td align="left" valign="top">46.6<break/>(31.4&#x2010;61)</td><td align="left" valign="top">44.4<break/>(28.1&#x2010;59.4)</td><td align="left" valign="top">39.1<break/>(19.4&#x2010;56.6)</td></tr><tr><td align="left" valign="top">Agreement measure</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x03BA;<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td><td align="left" valign="top">&#x2212;0.05</td><td align="left" valign="top">0.35</td><td align="left" valign="top">0.23</td><td align="left" valign="top">&#x2212;0.01</td><td align="left" valign="top">0.02</td><td align="left" valign="top">0.10</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Performance of generalist vision-enabled large language models in detecting intraventricular conduction block from 70 deidentified 12-lead electrocardiograms collected during routine clinical care in a cardiology ward at the University Medical Center G&#x00F6;ttingen, Germany, with expert consensus 
as the reference standard. Model inference was conducted in July to August 2025. Prevalence of intraventricular conduction block was 25/70 (36%).</p></fn><fn id="table3fn2"><p><sup>b</sup>PPV: positive predictive value.</p></fn><fn id="table3fn3"><p><sup>c</sup>NPV: negative predictive value.</p></fn><fn id="table3fn4"><p><sup>d</sup>Agreement with expert interpretation measured using Cohen &#x03BA;.</p></fn></table-wrap-foot></table-wrap><p>For the 25 cases with intraventricular conduction block, we further assessed whether models could identify the correct type. ChatGPT-4 and ChatGPT-5 performed best, each correctly classifying 11 (44%) of 25 cases. ChatGPT-4&#x2019;s result likely reflected its frequent labeling of intraventricular conduction block and frequent attempts at type assignment. No other model exceeded 7 correct classifications. Performance was particularly low for Claude Opus-4.1 (1/25, 4%) and Claude Sonnet-4 (2/25, 8%). Conditional accuracy (given that a block was correctly detected) was 60% for ChatGPT-4 and ChatGPT-5 but &#x2264;30% for all other models. Agreement with human experts was highest for ChatGPT-5 (Cohen &#x03BA;=0.37), followed by Gemini 2.5 (&#x03BA;=0.10), while all other models demonstrated very poor agreement (&#x03BA;&#x003C;0.10).</p></sec><sec id="s3-8"><title>QTc Interval Prolongation</title><p>Prolonged QTc was present in 22 (31.4%) ECGs. Model accuracy for detecting QTc prolongation varied significantly (Cochran Q, <italic>P</italic>=.002). ChatGPT-5 and ChatGPT-4 (accuracies: 71.4% and 67.1%, balanced accuracies: 58.2% and 61.3%, <italic>F</italic><sub>1</sub>-scores: 33.3% and 46.5%, and agreement with human raters: &#x03BA;=0.20 and 0.23, respectively) demonstrated the highest metrics, while Gemini 2.5 had the lowest metrics (accuracy: 48.6%, balanced accuracy: 40.3%, <italic>F</italic><sub>1</sub>-score: 18.2%, and agreement with human raters: &#x03BA;=&#x2212;0.19). 
Most of the reported accuracy reflected correct classification of normal cases, with relatively high specificities but very low sensitivities, poor agreement with human raters, and poor classification performance as reflected by the detailed metrics shown in <xref ref-type="table" rid="table4">Table 4</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Measures regarding identification of corrected QT interval (QTc) prolongation<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Measure</td><td align="left" valign="bottom">OpenAI ChatGPT-4</td><td align="left" valign="bottom">OpenAI ChatGPT-5</td><td align="left" valign="bottom">Google Gemini 2.5</td><td align="left" valign="bottom">Microsoft Copilot</td><td align="left" valign="bottom">Claude Sonnet-4</td><td align="left" valign="bottom">Claude Opus-4.1</td></tr></thead><tbody><tr><td align="left" valign="top">Diagnostic metric</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sensitivity, % (95% CI)</td><td align="left" valign="top">45.5 (26.9&#x2010;65.3)</td><td align="left" valign="top">22.7 (10.1&#x2010;43.4)</td><td align="left" valign="top">18.2 (7.3&#x2010;38.5)</td><td align="left" valign="top">0 (0&#x2010;14.9)</td><td align="left" valign="top">0 (0&#x2010;14.9)</td><td align="left" valign="top">9.1 (2.5&#x2010;27.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Specificity, % (95% CI)</td><td align="left" valign="top">77.1 (63.5&#x2010;86.7)</td><td align="left" valign="top">93.8 (83.2&#x2010;97.9)</td><td align="left" 
valign="top">62.5 (48.4&#x2010;74.8)</td><td align="left" valign="top">100 (92.6&#x2010;100)</td><td align="left" valign="top">97.9 (89.1&#x2010;99.6)</td><td align="left" valign="top">91.7 (80.4&#x2010;96.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>PPV<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup>, % (95% CI)</td><td align="left" valign="top">47.6 (28.3&#x2010;67.6)</td><td align="left" valign="top">62.5 (30.6&#x2010;86.3)</td><td align="left" valign="top">18.2 (7.3&#x2010;38.5)</td><td align="left" valign="top">NA<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">0 (0&#x2010;79.3)</td><td align="left" valign="top">33.3 (9.7&#x2010;70)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NPV<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup>, % (95% CI)</td><td align="left" valign="top">75.5 (61.9&#x2010;85.4)</td><td align="left" valign="top">72.6 (60.4&#x2010;82.1)</td><td align="left" valign="top">62.5 (48.4&#x2010;74.8)</td><td align="left" valign="top">68.6 (57&#x2010;78.2)</td><td align="left" valign="top">68.1 (56.4&#x2010;77.9)</td><td align="left" valign="top">68.8 (56.6&#x2010;78.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy, % (95% CI)</td><td align="left" valign="top">67.1 (55.5&#x2010;77)</td><td align="left" valign="top">71.4 (59.9&#x2010;80.7)</td><td align="left" valign="top">48.6 (37.2&#x2010;60)</td><td align="left" valign="top">68.6 (57&#x2010;78.2)</td><td align="left" valign="top">67.1 (55.5&#x2010;77)</td><td align="left" valign="top">65.7 (54&#x2010;75.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Balanced accuracy, % (95% CI)</td><td 
align="left" valign="top">61.3 (49.9&#x2010;73.2)</td><td align="left" valign="top">58.2 (49.3&#x2010;69)</td><td align="left" valign="top">40.3 (29.9&#x2010;51.5)</td><td align="left" valign="top">50 (50&#x2010;50)</td><td align="left" valign="top">49 (46.5&#x2010;50)</td><td align="left" valign="top">50.4 (43.9&#x2010;58.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>F</italic><sub>1</sub>-score (95% CI)</td><td align="left" valign="top">46.5 (27.3&#x2010;64)</td><td align="left" valign="top">33.3 (9.5&#x2010;54.5)</td><td align="left" valign="top">18.2 (4.9&#x2010;33.3)</td><td align="left" valign="top">NA</td><td align="left" valign="top">NA</td><td align="left" valign="top">14.3 (6.2&#x2010;33.3)</td></tr><tr><td align="left" valign="top">Agreement measure</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x03BA;<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup></td><td align="left" valign="top">0.23</td><td align="left" valign="top">0.20</td><td align="left" valign="top">&#x2212;0.19</td><td align="left" valign="top">0</td><td align="left" valign="top">&#x2212;0.03</td><td align="left" valign="top">0.01</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Performance of generalist vision-enabled large language models in detecting QTc prolongation from 70 deidentified 12-lead electrocardiograms collected during routine clinical care in a cardiology ward at the University Medical Center G&#x00F6;ttingen, Germany, with expert consensus as the reference standard. Model inference was conducted in July to August 2025. 
Prevalence of QTc prolongation was 22/70 (31%).</p></fn><fn id="table4fn2"><p><sup>b</sup>PPV: positive predictive value.</p></fn><fn id="table4fn3"><p><sup>c</sup>NA: not applicable because no positives and/or no positive predictions.</p></fn><fn id="table4fn4"><p><sup>d</sup>NPV: negative predictive value.</p></fn><fn id="table4fn5"><p><sup>e</sup>Agreement with expert interpretation measured using Cohen &#x03BA;.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-9"><title>Premature Atrial and Ventricular Contractions</title><p>Only 3 ECGs contained PACs. None of the generalist models correctly detected these occurrences. Apparent accuracy for PAC detection was high (94%&#x2010;96%) in most models, but this was largely driven by the predominance of true negatives. All models except ChatGPT-5 answered &#x201C;no&#x201D; for every case (majority class classifier behavior), correctly classifying the 67 negatives but missing all 3 positives. ChatGPT-5 produced one false positive. Agreement with human experts was uniformly poor, with all models showing Cohen &#x03BA;&#x2264;0. However, with only 3 PAC cases in the dataset, these estimates are unstable and do not permit reliable model-to-model comparison.</p><p>Ten ECGs contained at least one PVC. Sensitivity was again low. Only ChatGPT-5 detected any PVCs, correctly identifying 1 of 10 cases (sensitivity 10%) without false positives (specificity 100%). Gemini 2.5 produced one false positive, while most models (ChatGPT-4, Copilot, Claude Sonnet-4, and Claude Opus-4.1) answered &#x201C;no&#x201D; for all cases, yielding 100% specificity but 0% sensitivity. Cohen &#x03BA; was &#x2264;0 for all models except ChatGPT-5 (&#x03BA;=0.16). Calculated diagnostic metrics for PACs and PVCs are reported in Tables S8 and S9 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>.</p></sec><sec id="s3-10"><title>ST-Segment Deviation</title><p>Significant ST-segment deviations were present in 14 ECGs. 
Gemini 2.5 flagged ST changes in 16 cases, identifying 3 true positives but generating 13 false positives, resulting in 65.7% accuracy, 49.1% balanced accuracy, and an <italic>F</italic><sub>1</sub>-score of 20%. In contrast, Copilot and Sonnet-4 were highly conservative, answering &#x201C;no&#x201D; in all cases (majority class classifier behavior) and yielding 80% accuracy. ChatGPT-4 detected 1 of 14 cases (sensitivity 7.1%) and Opus-4.1 had 0% sensitivity, although it produced one false positive. ChatGPT-5 detected 2 of 14 cases (sensitivity 14.3%) with only one false positive, achieving the highest overall accuracy (81.4%) and balanced accuracy (56.2%) among generalist models. No model achieved sensitivity above 25%. Agreement with human experts was poor: all models showed &#x03BA;&#x2264;0 (below chance level), except ChatGPT-5 (&#x03BA;=0.18). All calculated diagnostic metrics for ST-segment deviations are provided in Table S10 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>.</p></sec><sec id="s3-11"><title>QRS Axis Deviation</title><p>Axis deviations were present in 12 cases. ChatGPT-5 demonstrated the highest sensitivity (66.7%, correctly identifying 8 of 12 abnormal axes) but the lowest specificity (75.9%, 14 false positives among 58 normal cases), resulting in an overall accuracy of 74.3% (balanced accuracy: 71.3%; <italic>F</italic><sub>1</sub>-score=47.1%, and agreement with human experts: &#x03BA;=0.32). In contrast, Claude Sonnet-4 and Claude Opus-4.1 each produced only one false positive, achieving 81.4% accuracy (balanced accuracy: 49.1% and agreement with human experts: &#x03BA;=&#x2212;0.03). The Cochran Q test indicated no significant differences among models (<italic>P</italic>=.097). 
The diagnostic metrics of all generalist models on this task are shown in <xref ref-type="table" rid="table5">Table 5</xref>.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Measures regarding identification of QRS axis deviation<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup>.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Measures</td><td align="left" valign="bottom">OpenAI ChatGPT-4</td><td align="left" valign="bottom">OpenAI ChatGPT-5</td><td align="left" valign="bottom">Google Gemini 2.5</td><td align="left" valign="bottom">Microsoft Copilot</td><td align="left" valign="bottom">Claude Sonnet-4</td><td align="left" valign="bottom">Claude Opus-4.1</td></tr></thead><tbody><tr><td align="left" valign="top">Diagnostic metric</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sensitivity, % (95% CI)</td><td align="left" valign="top">25 (8.9&#x2010;53.2)</td><td align="left" valign="top">66.7 (39.1&#x2010;86.2)</td><td align="left" valign="top">41.7 (19.3&#x2010;68)</td><td align="left" valign="top">16.7 (4.7&#x2010;44.8)</td><td align="left" valign="top">0 (0&#x2010;24.2)</td><td align="left" valign="top">0 (0&#x2010;24.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Specificity, % (95% CI)</td><td align="left" valign="top">89.7 (79.2&#x2010;95.2)</td><td align="left" valign="top">75.9 (63.5&#x2010;85)</td><td align="left" valign="top">86.2 (75.1&#x2010;92.8)</td><td align="left" valign="top">89.7 (79.2&#x2010;95.2)</td><td align="left" valign="top">98.3 (90.9&#x2010;99.7)</td><td align="left" valign="top">98.3 
(90.9&#x2010;99.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>PPV<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup>, % (95% CI)</td><td align="left" valign="top">33.3 (12.1&#x2010;64.6)</td><td align="left" valign="top">36.4 (19.7&#x2010;57)</td><td align="left" valign="top">38.5 (17.7&#x2010;64.5)</td><td align="left" valign="top">25 (7.1&#x2010;59.1)</td><td align="left" valign="top">0 (0&#x2010;79.3)</td><td align="left" valign="top">0 (0&#x2010;79.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NPV<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup>, % (95% CI)</td><td align="left" valign="top">85.2 (74.3&#x2010;92)</td><td align="left" valign="top">91.7 (80.4&#x2010;96.7)</td><td align="left" valign="top">87.7 (76.8&#x2010;93.9)</td><td align="left" valign="top">83.9 (72.8&#x2010;91)</td><td align="left" valign="top">82.6 (72&#x2010;89.8)</td><td align="left" valign="top">82.6 (72&#x2010;89.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy, % (95% CI)</td><td align="left" valign="top">78.6 (67.6&#x2010;86.6)</td><td align="left" valign="top">74.3 (63&#x2010;83.1)</td><td align="left" valign="top">78.6 (67.6&#x2010;86.6)</td><td align="left" valign="top">77.1 (66&#x2010;85.4)</td><td align="left" valign="top">81.4 (70.8&#x2010;88.8)</td><td align="left" valign="top">81.4 (70.8&#x2010;88.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Balanced accuracy, % (95% CI)</td><td align="left" valign="top">57.3 (44.4&#x2010;70.5)</td><td align="left" valign="top">71.3 (55.5&#x2010;85.3)</td><td align="left" valign="top">63.9 (49&#x2010;79.2)</td><td align="left" valign="top">53.2 (43&#x2010;65.7)</td><td align="left" 
valign="top">49.1 (47.3&#x2010;50)</td><td align="left" valign="top">49.1 (47.2&#x2010;50)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>F</italic><sub>1</sub>-score (95% CI)</td><td align="left" valign="top">28.6 (9.5&#x2010;52.6)</td><td align="left" valign="top">47.1 (23.1&#x2010;66.7)</td><td align="left" valign="top">40 (14.7&#x2010;63.2)</td><td align="left" valign="top">20 (8.3&#x2010;43.9)</td><td align="left" valign="top">NA<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup></td><td align="left" valign="top">NA</td></tr><tr><td align="left" valign="top">Agreement measure</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x03BA;<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">0.16</td><td align="left" valign="top">0.32</td><td align="left" valign="top">0.27</td><td align="left" valign="top">0.07</td><td align="left" valign="top">&#x2212;0.03</td><td align="left" valign="top">&#x2212;0.03</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup> Performance of generalist vision-enabled large language models in detecting QRS axis deviation from 70 deidentified 12-lead electrocardiograms collected during routine clinical care in a cardiology ward at the University Medical Center G&#x00F6;ttingen, Germany, with expert consensus as the reference standard. Model inference was conducted in July to August 2025. Prevalence of QRS axis deviation was 12/70 (17%). 
</p></fn><fn id="table5fn2"><p><sup>b</sup>PPV: positive predictive value.</p></fn><fn id="table5fn3"><p><sup>c</sup>NPV: negative predictive value.</p></fn><fn id="table5fn4"><p><sup>d</sup>NA: not applicable because no positives and/or no positive predictions. </p></fn><fn id="table5fn5"><p><sup>e</sup>Agreement with expert interpretation measured using Cohen &#x03BA;.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-12"><title>Safety Analysis: False-Negative Errors in High-Priority ECG Abnormalities</title><p>To complement aggregate performance metrics, we analyzed false-negative classifications, defined as cases in which an abnormal ECG finding was labeled as normal, in high-priority abnormalities. For ST-segment deviation (n=14), generalist models missed between 11 and 14 cases, corresponding to false-negative rates of 79% to 100%. For AF (n=9), most generalist models missed all 9 cases (100% false-negative rate), with only 2 models detecting 1 of 9 cases (89% false-negative rate). For QT prolongation (n=22), false-negative counts ranged from 12 to 22 cases (55% to 100%) across models, indicating substantial underdetection. Although the number of positive cases was limited, these findings indicate that models frequently failed to identify abnormalities with potential immediate clinical consequences and that acceptable overall accuracy often masked a substantial burden of missed pathological findings.</p></sec><sec id="s3-13"><title>Performance of ECG-Specialized LLMs</title><p>To also explore the performance of ECG-specialized LLMs on our dataset, we tested 2 further models: PULSE-7B and ECG-Instruct-Llama-3.2-11B-Vision (ECG-Instruct-Llama-3.2). PULSE-7B demonstrated higher overall balanced accuracy (64.4% vs 56.5%) and <italic>F</italic><sub>1</sub>-score (40.1% vs 32.9%) than ECG-Instruct-Llama-3.2 as detailed in <xref ref-type="table" rid="table6">Table 6</xref>. 
In paired analysis, discordant classifications numerically favored PULSE-7B (odds ratio [OR] 1.4, 95% CI 1.1&#x2010;1.8), although this asymmetry did not remain statistically significant after Bonferroni correction (adjusted <italic>P</italic>=.51). Among all generalist and specialized models, PULSE-7B achieved the highest overall balanced accuracy and <italic>F</italic><sub>1</sub>-score, exceeding the best-performing general model, ChatGPT-5 (64.4% and 40.1% vs 61.8% and 38%, respectively). Paired McNemar testing between ChatGPT-5 and PULSE-7B demonstrated statistically significant asymmetry in discordant classifications (OR 2, 95% CI 1.45&#x2010;2.81; adjusted <italic>P</italic>=.001), indicating that ChatGPT-5 and PULSE-7B differed systematically at the case level. When comparing models with the highest balanced accuracy per diagnostic category between the generalist and specialized groups, the generalist models, mostly ChatGPT-5, showed numerically higher balanced accuracy than the specialized models in the classification of first-degree AV Block, IV Block, IV Block Type, QT prolongation, ischemic ST-segment changes, and axis deviation. 
Head-to-head McNemar testing showed variable effect sizes (ORs) and mostly no statistical significance after correcting for multiple testing as shown in Table S11 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>.</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Overall measures of electrocardiogram (ECG)&#x2013;specialized large language models<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup>.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Measures</td><td align="left" valign="bottom">PULSE-7B</td><td align="left" valign="bottom">PULSE-7B (SP<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup>)</td><td align="left" valign="bottom">ECG-Instruct-Llama-3.2</td><td align="left" valign="bottom">ECG-Instruct-Llama-3.2 (SP)</td></tr></thead><tbody><tr><td align="left" valign="top">Diagnostic metric</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Sensitivity, % (95% CI)</td><td align="left" valign="top">56.1 (47&#x2010;64.9)</td><td align="left" valign="top">60.6 (51.9&#x2010;68.7)</td><td align="left" valign="top">45.6 (37.1&#x2010;54.3)</td><td align="left" valign="top">42.1 (33.8&#x2010;50.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Specificity, % (95% CI)</td><td align="left" valign="top">72.7 (68.7&#x2010;76.3)</td><td align="left" valign="top">77.7 (73.9&#x2010;81.2)</td><td align="left" valign="top">67.3 (63.1&#x2010;71.3)</td><td align="left" valign="top">68.7 (64.5&#x2010;72.5)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>PPV<sup><xref ref-type="table-fn" 
rid="table6fn3">c</xref></sup>, % (95% CI)</td><td align="left" valign="top">31.2 (25.3&#x2010;37.9)</td><td align="left" valign="top">40.7 (34&#x2010;47.9)</td><td align="left" valign="top">25.7 (20.4&#x2010;31.8)</td><td align="left" valign="top">25.1 (19.7&#x2010;31.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>NPV<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup>, % (95% CI)</td><td align="left" valign="top">88.2 (84.8&#x2010;91)</td><td align="left" valign="top">88.7 (85.4&#x2010;91.3)</td><td align="left" valign="top">83.3 (79.4&#x2010;86.6)</td><td align="left" valign="top">82.6 (78.7&#x2010;85.9)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Accuracy, % (95% CI)</td><td align="left" valign="top">69.7 (66&#x2010;73.1)</td><td align="left" valign="top">74.3 (70.7&#x2010;77.5)</td><td align="left" valign="top">63 (59.2&#x2010;66.7)</td><td align="left" valign="top">63.3 (59.5&#x2010;67)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Balanced accuracy, % (95% CI)</td><td align="left" valign="top">64.4 (59.6&#x2010;69.3)</td><td align="left" valign="top">69.2 (64.3&#x2010;73.8)</td><td align="left" valign="top">56.5 (51.4&#x2010;61.2)</td><td align="left" valign="top">55.4 (50.4&#x2010;60.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><italic>F</italic><sub>1</sub>-score (95% CI)</td><td align="left" valign="top">40.1 (33.1&#x2010;46.9)</td><td align="left" valign="top">48.7 (41.7&#x2010;55.3)</td><td align="left" valign="top">32.9 (26.6&#x2010;39)</td><td align="left" valign="top">31.5 (24.8&#x2010;37.6)</td></tr><tr><td align="left" valign="top">Agreement measure</td><td align="left" valign="top"/><td align="left" 
valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>&#x03BA;<sup><xref ref-type="table-fn" rid="table6fn5">e</xref></sup></td><td align="left" valign="top">0.28</td><td align="left" valign="top">0.36</td><td align="left" valign="top">0.14</td><td align="left" valign="top">0.13</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>Performance of image-based ECG-specialized vision-enabled large language models in interpreting 70 deidentified 12-lead ECGs collected during routine clinical care in a cardiology ward at the University Medical Center G&#x00F6;ttingen, Germany, with expert consensus as the reference standard. Model inference was conducted in January 2026. </p></fn><fn id="table6fn2"><p><sup>b</sup>SP: short-prompted.</p></fn><fn id="table6fn3"><p><sup>c</sup>PPV: positive predictive value.</p></fn><fn id="table6fn4"><p><sup>d</sup>NPV: negative predictive value.</p></fn><fn id="table6fn5"><p><sup>e</sup>Agreement with expert interpretation measured using Cohen &#x03BA;.</p></fn></table-wrap-foot></table-wrap><p>The use of an impression-style shortened prompt further increased the balanced accuracy and <italic>F</italic><sub>1</sub>-score of PULSE-7B to 69.2% and 48.7%, respectively. Although discordant classifications numerically differed between prompt formats (OR 0.7, 95% CI 0.5&#x2010;0.9), this effect was not statistically significant after correction for multiple comparisons (adjusted <italic>P</italic>&#x003E;.99). No prompt-related effect was observed for ECG-Instruct-Llama-3.2, where balanced accuracy and <italic>F</italic><sub>1</sub>-score were similar using the standard and shortened prompts (56.5% vs 55.4% and 32.9% vs 31.5%, respectively; OR 1.0, 95% CI 0.7&#x2010;1.3; adjusted <italic>P</italic>&#x003E;.99). 
When the short prompt was used for specialized models, they, mostly PULSE-7B, demonstrated numerically higher balanced accuracy than the generalist models in the classification of rhythm, first-degree AV Block, PACs, PVCs, and axis deviation as shown in Table S12 in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>. However, this comparison remains limited because different prompts were used to test the models. Notably, the specialized models achieved consistently higher metrics regarding the detection of AF, with the short-prompted PULSE-7B reaching a balanced accuracy of 80.9% and an <italic>F</italic><sub>1</sub>-score of 66.7%; however, this finding is still limited by the small case numbers in the tested dataset (9/70). All diagnostic metrics regarding the detection of AF are reported in <xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>. When compared to a majority class classifier (ZeroR) across all categories, both standard-prompted ECG-specialized models had accuracy below or equal to ZeroR accuracy. In McNemar testing, no statistically significant asymmetry in discordant classifications was observed in any category except for the detection of first-degree AV Block with PULSE-7B (OR 7.5, 95% CI 1.7&#x2010;32.8, adjusted <italic>P</italic>=.04) and ischemic ST-segment changes in ECG-Instruct-Llama-3.2 (OR 3.5, 95% CI 1.9&#x2010;6.6, adjusted <italic>P</italic>&#x003C;.001). The only model that had favorable and statistically significant asymmetry in discordant classifications in comparison with ZeroR was the short-prompted PULSE-7B in PVC detection (OR 0.1, 95% CI 0&#x2010;1, unadjusted <italic>P</italic>=.046), correlating with the highest balanced accuracy and <italic>F</italic><sub>1</sub>-score among all generalist and specialized models for this task (89.2% and 84.2%, respectively). 
However, this superiority did not survive correction for multiple testing (adjusted <italic>P</italic>=.455) as shown in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Summary of Findings</title><p>To the best of our knowledge, this is the first broad evaluation of the latest publicly available VE-LLMs on image-based ECG interpretation. We found substantial variability in performance and response characteristics across the 6 generalist models tested. Overall balanced accuracy ranged from 50.1% to 61.8%, with ChatGPT-5 achieving the highest overall balanced accuracy across the 9 ECG question categories. All generalist models correctly identified the majority class (normal SR or normal findings) in most cases but performed poorly on less common abnormalities. Sensitivity was particularly low for AF, conduction blocks, and ectopic beats, despite high specificity driven by a tendency toward normal classifications. This resulted in only poor or, at best, fair agreement with human expert answers on most tasks. The image-based ECG-specialized model, PULSE-7B, demonstrated slightly higher overall balanced accuracy than generalist models (64.4%). PULSE-7B showed numerically higher task-specific performance metrics for rhythm classification, identification of AF, and ectopic beats. However, this pattern was not consistent across other diagnostic categories (Summary Figure in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>). These findings should be interpreted as descriptive and hypothesis generating, given the small number of positive cases in multiple categories.</p></sec><sec id="s4-2"><title>Performance of Publicly Accessible Vision-Enabled LLMs in Image-Based ECG Analysis</title><p>Our findings align with and extend early investigations of multimodal LLMs for ECG analysis. 
A recently published study evaluated ChatGPT-4 on 6 ECG categories. With the zero-shot approach and no textual guidance, ChatGPT-4 achieved an accuracy of 53% in detecting abnormal ECGs, resulting mainly from a perfect sensitivity (100%), at the cost of a very poor specificity (7%). Providing textual guidance could boost the accuracy up to 63%. ChatGPT-4 also showed poor performance identifying specific pathologies even with textual guidance with an accuracy ranging from 28% to 41% [<xref ref-type="bibr" rid="ref13">13</xref>]. This difficulty in multidiagnosis interpretation mirrors our results, where no model reliably recognized all the diverse ECG abnormalities. Another study by Zeljkovic et al [<xref ref-type="bibr" rid="ref14">14</xref>] yielded comparable performance metrics; however, it demonstrated that clinical context can significantly improve accuracy of ChatGPT-4 interpretation of ECGs (from 19% to 45%). However, contextual information alone is unlikely to overcome intrinsic limitations of current vision encoders, particularly for tasks requiring precise measurements. Beyond ChatGPT-4, only very few studies investigated the performance of other generalist VE-LLMs in ECG interpretation; 2 comparative studies have shown that ChatGPT-4 outperformed Gemini, with both performing substantially below human experts [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. Our study is the first to investigate the state-of-the-art OpenAI model, ChatGPT-5, and to broadly compare the performance of generalist and 2 image-based ECG-specialized LLMs, PULSE-7B and ECG-Instruct-Llama-3.2. In their original reports, both specialist models demonstrated improved performance compared with general multimodal LLMs on ECG image benchmarks [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. 
In our external dataset, the overall performance metrics of PULSE-7B were slightly higher than those of generalist models; however, this superiority was rather modest and not consistent across all diagnostic categories. PULSE-7B achieved an overall balanced accuracy of 64.4%, which could be boosted to 69.2% with the impression-style concise prompt, whereas ECG-Instruct-Llama-3.2 reached approximately 56% overall balanced accuracy with limited improvement with prompt shortening. Notably, ECG-specialized models achieved the highest performance metrics in rhythm classification. Besides, the short-prompted PULSE-7B reached high balanced accuracy (up to 89.2%) and good agreement with expert annotations (up to 0.82) regarding the detection of ectopic beats. This may reflect its instruction-tuning on large ECG image corpora emphasizing visual pattern recognition of rhythm irregularity and beat morphology, rather than interval measurements that depend more strongly on precise signal decoding and fine-grained temporal resolution, which may be less robust in image-based inference pipelines.</p><p>In contrast to image-based LLMs, several deep learning CNNs trained specifically on ECG waveforms have already achieved expert-level performance [<xref ref-type="bibr" rid="ref16">16</xref>] and, in some cases, identified hidden patterns that are not apparent to human readers [<xref ref-type="bibr" rid="ref8">8</xref>]. For example, specialized ECG CNN models can predict AF in patients with SR [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. In our dataset and study settings, generalist VE-LLMs failed to reliably detect the presence of AF based on ECG images, with a task-specific advantage for specialized models, which reached a markedly higher balanced accuracy and <italic>F</italic><sub>1</sub>-scores for AF when compared with the generalist models. 
A recent head-to-head comparison found that a specialized ECG diagnostic model (&#x201C;ECG Buddy&#x201D;) significantly outperformed ChatGPT-4 in detecting acute myocardial infarction on ECGs [<xref ref-type="bibr" rid="ref15">15</xref>]. We did not observe a similar trend for ischemic ST-segment deviation in our study, where both generalist and specialized LLMs performed poorly and the highest balanced accuracy (56.2%) was achieved by ChatGPT-5. Although ECG-Instruct-Llama-3.2 scored the highest sensitivity (92.9%) in this task, this was at the cost of a very low specificity (17.9%), resulting from its tendency to label most presented ECGs as suspicious for ischemia. Conversely, many VE-LLMs achieved seemingly high accuracy in some subcategories, largely driven by class imbalance and correct classification of normal findings, as the majority class (ZeroR) baseline achieved higher overall accuracy than most models, suggesting potentially limited incremental value beyond predicting the most frequent class. However, this apparent advantage reflects class imbalance rather than meaningful diagnostic performance: because ZeroR always predicts the majority class, balanced accuracy remains at the chance level of 50%, and in categories where the majority class corresponds to normal findings the <italic>F</italic><sub>1</sub>-score collapses to 0, indicating no clinical utility for abnormality detection. Agreement of generalist models with human experts was consistently poor: &#x03BA; values were often near or below zero, reflecting little true concordance beyond chance. Taken together, these findings indicate that current general-purpose VE-LLMs remain markedly inferior to human experts for detailed ECG interpretation.</p></sec><sec id="s4-3"><title>Challenges of Current Vision-Enabled LLMs in ECG Interpretation</title><p>Several factors may explain why generalist cutting-edge LLMs underperformed in image-based ECG interpretation despite their strong language abilities. 
A key issue is training data. These models were trained on vast datasets that likely contained relatively few annotated ECG images, as such data are not abundant in the public domain compared with natural images or text. Consequently, their vision encoders may not have developed the specialized feature detectors required for precise waveform analysis, limiting their ability to capture the fine-grained features essential for ECG interpretation. Zhu et al [<xref ref-type="bibr" rid="ref10">10</xref>] reported that ChatGPT-4 could answer roughly two-thirds of multiple-choice ECG questions correctly, but it struggled disproportionately with questions requiring precise waveform measurements (eg, identifying a prolonged PR interval). We similarly found that first-degree AV block (prolonged PR) and QT prolongation were seldom detected by the tested models, possibly because ECG images, even when exported at high resolution, are internally downsampled or tokenized by vision language models, reducing fine waveform detail. As a result, precise measurement of interval durations (eg, PR or QT) may become technically challenging, particularly when interpretation depends on millimeter-scale grid resolution.</p><p>Although hallucinations were not systematically captured or quantified in this study, they remain a major safety issue for medical applications of LLMs. Pesapane et al [<xref ref-type="bibr" rid="ref17">17</xref>] documented multiple examples of ChatGPT-4 inventing abnormalities on normal mammograms. A recently published study that evaluated the performance of ChatGPT-4 and Gemini 1.5 in ECG interpretation with a focus on different types of hallucination revealed that even when LLMs answered correctly, hallucinations were still common [<xref ref-type="bibr" rid="ref18">18</xref>]. 
Thus, measuring LLM performance solely by the percentage of correct responses on multiple-choice tests may overestimate true interpretive ability.</p><p>The modest performance of ECG-specialized VE-LLMs in our evaluation compared with benchmark reports likely reflects several factors. Our study used real-world cardiology-ward ECG exports, whereas training and benchmark datasets included synthetically rendered ECG images derived from signal data and different cohorts. In addition, unlike prior reports, our zero-shot prompting approach required strict categorical decisions based on predefined measurement thresholds. Moreover, these comparisons are not strictly head-to-head, as prior studies used different datasets, label definitions, and evaluation protocols than our fixed 9-question rubric applied to real-world ECG image exports. Finally, the modest sample size, class imbalance, and low prevalence of certain abnormalities likely affected performance metrics.</p></sec><sec id="s4-4"><title>Study Limitations</title><p>This study has several limitations. First, the dataset was small (70 ECGs) and lacked rare but important findings such as ventricular tachycardia or advanced AV block. The study was not powered to detect predefined differences between models. A post-hoc minimum detectable difference analysis indicated that only absolute differences in overall accuracy &#x2265;22% could be detected with adequate statistical power. Smaller effect sizes may therefore not have been detectable, and findings should be interpreted as descriptive and hypothesis generating. In addition, the dataset was derived from a dedicated heart rhythm ward, where the pretest probability of arrhythmias and conduction abnormalities is higher than in general screening or emergency department populations. This case mix may have influenced predictive values and limits the generalizability of our findings to broader clinical settings and patient populations. 
Second, because all models were evaluated using a standardized zero-shot prompt, the reported results likely reflect baseline performance rather than maximum achievable capability. Advanced prompting strategies (eg, few-shot or chain-of-thought prompting) might have improved accuracy, as shown in previous research [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. Third, the ECG-Instruct-Llama-3.2-11B-Vision model required manual extraction and mapping of diagnostic findings from narrative outputs; although done with a conservative approach, some degree of interpretation bias and misclassification cannot be fully excluded. In addition, output generation for the ECG-specialized models was limited to 512 new tokens (Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), which may have truncated longer responses and reduced sensitivity if abnormalities were not included in the generated output. Fourth, some diagnostic categories were imbalanced in prevalence, which may have inflated accuracy metrics. Fifth, our evaluation was limited to English prompts; performance may differ in other languages or formats. Sixth, hallucinations were not systematically captured or quantified in this study; therefore, qualitative assessment of such errors was beyond the scope of the present analysis. Seventh, reference answers were defined by 2 cardiologists, which is reliable, although not infallible. We focused on well-quantifiable gross abnormalities, where careful measurement and consensus reading make the likelihood of missed findings very low. In addition, expert readers performed manual interval measurements using full-resolution ECG images, whereas vision-enabled multimodal models internally downsample and tokenize image inputs, creating an inherent asymmetry that may disadvantage models in interval-dependent tasks. 
Eighth, we did not include a conventional signal-based CNN benchmark; therefore, direct comparison with established waveform-based AI systems is not possible. Ninth, response time measurements were obtained via web interfaces and may therefore have been influenced by factors other than intrinsic model inference speed, such as server load, account tier, and time of day. In addition, the generalist web-based models were evaluated from July to August 2025, whereas the ECG-specialized models were tested in January 2026. As publicly deployed multimodal systems may undergo unannounced backend updates over time, comparisons across these evaluation windows should be interpreted cautiously. Finally, although we included the most advanced VE-LLMs at the time of study, these systems evolve rapidly, so our results may not generalize to future updates.</p></sec><sec id="s4-5"><title>Clinical Translation and Future Research Directions</title><p>Recent developments underscore both the promise and the challenge of combining LLM structures with dedicated ECG encoding and training. For example, Yang et al [<xref ref-type="bibr" rid="ref19">19</xref>] introduced ECG-LM as the first cross-modal LLM aligned with a dedicated ECG encoder, which achieved remarkable zero-shot results in ECG diagnostics and question-answering. In a very recent preprint, Xia et al [<xref ref-type="bibr" rid="ref20">20</xref>] have applied a different approach and proposed ECG-aBcDe, a framework that encodes ECGs into a universal representation consumable by LLMs. In this early evaluation, ECG-aBcDe demonstrated substantially improved zero-shot performance in ECG question-answering across datasets compared with prior approaches. Notably, these solutions are still largely based on ECG signal data. Effective translation of LLM technologies into routine clinical workflows requires a feasible and reliable image-based interpretation, a capability that has not yet been established. 
Beyond the specific model comparisons, the standardized evaluation pipeline presented here&#x2014;including structured prompting, predefined diagnostic tasks, and detailed reporting of inference settings&#x2014;may serve as a reproducible framework for benchmarking future vision-enabled LLMs as the field evolves.</p></sec><sec id="s4-6"><title>Conclusions</title><p>Our work provides a timely reality check on the capabilities of generalist multimodal LLMs in a core cardiology task, ECG interpretation. While their language abilities are remarkable, the performance of generalist LLMs in ECG interpretation is inconsistent across models and diagnostic categories and remains insufficient for safe clinical use. ChatGPT-5 achieved the highest overall accuracy among publicly accessible VE-LLMs, yet it too fell short of expert standards. The ECG-specialized model, PULSE-7B, demonstrated a slightly better overall performance and may offer advantages in selected tasks, but its performance is still inconsistent across the full range of diagnostic categories. Although promising, current evidence indicates that VE-LLMs require further optimization and validation before routine clinical integration.</p></sec></sec></body><back><ack><p>ChatGPT-5 and Gemini 2.5 were used to assist in drafting code and improving grammar and writing style throughout the manuscript. In addition, large language model&#x2013;based tools were used to generate illustrative elements of the summary figure. All intellectual input and final editing were performed by the authors.</p></ack><notes><sec><title>Funding</title><p>The Department of Cardiology and Pneumology at the University Medical Center G&#x00F6;ttingen received support from the Stiftung Zukunft Niedersachsen (funding recipient: CS). 
In addition, the Department of Cardiology and Pneumology at the University Medical Center G&#x00F6;ttingen was funded by the German Centre for Cardiovascular Research (DZHK) at the Lower Saxony site and the BMBF for the ACRIBIS consortium. CS and FW are members of CRC 1550. The R code for the statistical analyses is available in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p></sec><sec><title>Data Availability</title><p>The anonymized ground-truth label matrix is available in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>. ECG images are not publicly available due to institutional regulations.</p></sec></notes><fn-group><fn fn-type="con"><p>NS contributed to conceptualization, methodology, data curation, formal analysis, and writing of the original draft. ER, HH, and FW contributed to investigation and critically reviewed and edited the manuscript. MZ supervised the study and contributed to manuscript review and editing. CS contributed to supervision, funding acquisition, and manuscript review and editing. RS contributed to conceptualization, software, formal analysis, visualization, and manuscript review and editing. 
LB contributed to conceptualization, supervision, project administration, and manuscript review and editing.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AF</term><def><p>atrial fibrillation</p></def></def-item><def-item><term id="abb2">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb3">AV</term><def><p>atrioventricular</p></def></def-item><def-item><term id="abb4">CNN</term><def><p>convolutional neural network</p></def></def-item><def-item><term id="abb5">ECG</term><def><p>electrocardiogram</p></def></def-item><def-item><term id="abb6">IVCD</term><def><p>intraventricular conduction delay</p></def></def-item><def-item><term id="abb7">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb8">NPV</term><def><p>negative predictive value</p></def></def-item><def-item><term id="abb9">OR</term><def><p>odds ratio</p></def></def-item><def-item><term id="abb10">PAC</term><def><p>premature atrial contraction</p></def></def-item><def-item><term id="abb11">PPV</term><def><p>positive predictive value</p></def></def-item><def-item><term id="abb12">PVC</term><def><p>premature ventricular contraction</p></def></def-item><def-item><term id="abb13">QTc</term><def><p>corrected QT interval</p></def></def-item><def-item><term id="abb14">SR</term><def><p>sinus rhythm</p></def></def-item><def-item><term id="abb15">USMLE</term><def><p>United States Medical Licensing Examination</p></def></def-item><def-item><term id="abb16">VE-LLM</term><def><p>vision-enabled large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> 
</name><name name-style="western"><surname>Gottweis</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Toward expert-level medical question answering with large language models</article-title><source>Nat Med</source><year>2025</year><month>03</month><volume>31</volume><issue>3</issue><fpage>943</fpage><lpage>950</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03423-7</pub-id><pub-id pub-id-type="medline">39779926</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>HK</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>HE</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>E</given-names> </name></person-group><article-title>Performance of ChatGPT-3.5 and GPT-4 in national licensing examinations for medicine, pharmacy, dentistry, and nursing: a systematic review and meta-analysis</article-title><source>BMC Med Educ</source><year>2024</year><month>09</month><day>16</day><volume>24</volume><issue>1</issue><fpage>1013</fpage><pub-id pub-id-type="doi">10.1186/s12909-024-05944-8</pub-id><pub-id pub-id-type="medline">39285377</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Okuhara</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT across different versions in medical licensing examinations worldwide: systematic review and meta-analysis</article-title><source>J Med Internet Res</source><year>2024</year><month>07</month><day>25</day><volume>26</volume><fpage>e60807</fpage><pub-id 
pub-id-type="doi">10.2196/60807</pub-id><pub-id pub-id-type="medline">39052324</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Duggan</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Gervase</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schoenbaum</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Clinician experiences with ambient scribe technology to assist with documentation burden and efficiency</article-title><source>JAMA Netw Open</source><year>2025</year><month>02</month><day>3</day><volume>8</volume><issue>2</issue><fpage>e2460637</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.60637</pub-id><pub-id pub-id-type="medline">39969880</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Berigan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Short</surname><given-names>R</given-names> </name><name name-style="western"><surname>Reisman</surname><given-names>D</given-names> </name><etal/></person-group><article-title>The impact of large language model-generated radiology report summaries on patient comprehension: a randomized controlled trial</article-title><source>J Am Coll Radiol</source><year>2024</year><month>12</month><volume>21</volume><issue>12</issue><fpage>1898</fpage><lpage>1903</lpage><pub-id pub-id-type="doi">10.1016/j.jacr.2024.06.018</pub-id><pub-id pub-id-type="medline">38964446</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yuan</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Duffy</surname><given-names>G</given-names> </name><name name-style="western"><surname>Dhruva</surname><given-names>SS</given-names> </name><etal/></person-group><article-title>Deep learning of electrocardiograms in sinus rhythm from US veterans to predict atrial fibrillation</article-title><source>JAMA Cardiol</source><year>2023</year><month>12</month><day>1</day><volume>8</volume><issue>12</issue><fpage>1131</fpage><lpage>1139</lpage><pub-id pub-id-type="doi">10.1001/jamacardio.2023.3701</pub-id><pub-id pub-id-type="medline">37851434</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jabbour</surname><given-names>G</given-names> </name><name name-style="western"><surname>Nolin-Lapalme</surname><given-names>A</given-names> </name><name name-style="western"><surname>Tastet</surname><given-names>O</given-names> </name><etal/></person-group><article-title>Prediction of incident atrial fibrillation using deep learning, clinical models, and polygenic scores</article-title><source>Eur Heart J</source><year>2024</year><month>12</month><day>7</day><volume>45</volume><issue>46</issue><fpage>4920</fpage><lpage>4934</lpage><pub-id pub-id-type="doi">10.1093/eurheartj/ehae595</pub-id><pub-id pub-id-type="medline">39217446</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Poterucha</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Jing</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ricart</surname><given-names>RP</given-names> </name><etal/></person-group><article-title>Detecting structural heart disease from electrocardiograms using 
AI</article-title><source>Nature</source><year>2025</year><month>08</month><volume>644</volume><issue>8075</issue><fpage>221</fpage><lpage>230</lpage><pub-id pub-id-type="doi">10.1038/s41586-025-09227-0</pub-id><pub-id pub-id-type="medline">40670798</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>G&#x00FC;nay</surname><given-names>S</given-names> </name><name name-style="western"><surname>&#x00D6;zt&#x00FC;rk</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yi&#x011F;it</surname><given-names>Y</given-names> </name></person-group><article-title>The accuracy of Gemini, GPT-4, and GPT-4o in ECG analysis: a comparison with cardiologists and emergency medicine specialists</article-title><source>Am J Emerg Med</source><year>2024</year><month>10</month><volume>84</volume><fpage>68</fpage><lpage>73</lpage><pub-id pub-id-type="doi">10.1016/j.ajem.2024.07.043</pub-id><pub-id pub-id-type="medline">39096711</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mou</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Multimodal ChatGPT-4V for electrocardiogram interpretation: promise and limitations</article-title><source>J Med Internet Res</source><year>2024</year><month>06</month><day>26</day><volume>26</volume><fpage>e54607</fpage><pub-id pub-id-type="doi">10.2196/54607</pub-id><pub-id pub-id-type="medline">38764297</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Mukkunnoth</surname><given-names>N</given-names> </name><name name-style="western"><surname>Mukkunnoth</surname><given-names>A</given-names> </name><name name-style="western"><surname>Khapre</surname><given-names>M</given-names> </name><name name-style="western"><surname>Benamanahalli Puttaswamy</surname><given-names>S</given-names> </name></person-group><article-title>High-accuracy ECG image interpretation using parameter-efficient Low-Rank Adaptation (LoRA) fine-tuning with multimodal LLaMA V.3.2</article-title><source>BMJ Digit Health</source><year>2025</year><month>12</month><volume>1</volume><issue>1</issue><fpage>e000031</fpage><pub-id pub-id-type="doi">10.1136/bmjdhai-2025-000031</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yue</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>P</given-names> </name></person-group><article-title>Teach multimodal LLMs to comprehend electrocardiographic images</article-title><source>arXiv</source><comment>Preprint posted online on Oct 21, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.19008</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Engelstein</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ramon-Gonen</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sabbag</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Effectiveness of the GPT-4o model in interpreting electrocardiogram images for cardiac diagnostics: 
diagnostic accuracy study</article-title><source>JMIR AI</source><year>2025</year><month>08</month><day>22</day><volume>4</volume><fpage>e74426</fpage><pub-id pub-id-type="doi">10.2196/74426</pub-id><pub-id pub-id-type="medline">40845836</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zeljkovic</surname><given-names>I</given-names> </name><name name-style="western"><surname>Novak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lisicic</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Beyond text: the impact of clinical context on GPT-4&#x2019;s 12-lead electrocardiogram interpretation accuracy</article-title><source>Can J Cardiol</source><year>2025</year><month>07</month><volume>41</volume><issue>7</issue><fpage>1406</fpage><lpage>1414</lpage><pub-id pub-id-type="doi">10.1016/j.cjca.2025.01.036</pub-id><pub-id pub-id-type="medline">39971004</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yoo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Suh</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name></person-group><article-title>Comparative diagnostic performance of a multimodal large language model versus a dedicated electrocardiogram AI in detecting myocardial infarction from electrocardiogram images: comparative study</article-title><source>JMIR 
AI</source><year>2025</year><month>09</month><day>17</day><volume>4</volume><fpage>e75910</fpage><pub-id pub-id-type="doi">10.2196/75910</pub-id><pub-id pub-id-type="medline">40961357</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ribeiro</surname><given-names>AH</given-names> </name><name name-style="western"><surname>Ribeiro</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Paix&#x00E3;o</surname><given-names>GMM</given-names> </name><etal/></person-group><article-title>Automatic diagnosis of the 12-lead ECG using a deep neural network</article-title><source>Nat Commun</source><year>2020</year><month>04</month><day>9</day><volume>11</volume><issue>1</issue><fpage>1760</fpage><pub-id pub-id-type="doi">10.1038/s41467-020-15432-4</pub-id><pub-id pub-id-type="medline">32273514</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pesapane</surname><given-names>F</given-names> </name><name name-style="western"><surname>Nicosia</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rotili</surname><given-names>A</given-names> </name><etal/></person-group><article-title>A preliminary investigation into the potential, pitfalls, and limitations of large language models for mammography interpretation</article-title><source>Discov Oncol</source><year>2025</year><month>02</month><day>24</day><volume>16</volume><issue>1</issue><fpage>233</fpage><pub-id pub-id-type="doi">10.1007/s12672-025-02005-4</pub-id><pub-id pub-id-type="medline">39992569</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Seki</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Kawazoe</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ito</surname><given-names>H</given-names> </name><name name-style="western"><surname>Akagi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Takiguchi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ohe</surname><given-names>K</given-names> </name></person-group><article-title>Assessing the performance of zero-shot visual question answering in multimodal large language models for 12-lead ECG image interpretation</article-title><source>Front Cardiovasc Med</source><year>2025</year><volume>12</volume><fpage>1458289</fpage><pub-id pub-id-type="doi">10.3389/fcvm.2025.1458289</pub-id><pub-id pub-id-type="medline">39981353</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hong</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>ECG-LM: understanding electrocardiogram with a large language model</article-title><source>Health Data Sci</source><year>2025</year><volume>5</volume><issue>221</issue><fpage>0221</fpage><pub-id pub-id-type="doi">10.34133/hds.0221</pub-id><pub-id pub-id-type="medline">39906894</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xia</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bu</surname><given-names>J</given-names> 
</name><name name-style="western"><surname>Wang</surname><given-names>K</given-names> </name></person-group><article-title>ECG-aBcDe: overcoming model dependence, encoding ECG into a universal language for any large language model</article-title><source>Comput Biol Med</source><year>2026</year><month>02</month><day>1</day><volume>202</volume><fpage>111439</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2025.111439</pub-id><pub-id pub-id-type="medline">41483694</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Inference environment and settings used for the evaluated models.</p><media xlink:href="jmir_v28i1e86692_app1.docx" xlink:title="DOCX File, 32 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Example interaction with ChatGPT-5 demonstrating the model query and response process (screen recording).</p><media xlink:href="jmir_v28i1e86692_app2.mp4" xlink:title="MP4 File, 5163 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>R scripts used for data processing, statistical analyses, and generation of study figures.</p><media xlink:href="jmir_v28i1e86692_app3.zip" xlink:title="ZIP File, 19 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Dataset matrix.</p><media xlink:href="jmir_v28i1e86692_app4.xlsx" xlink:title="XLSX File, 12 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Extended results including additional tables and figures detailing model performance and diagnostic metrics across evaluation tasks.</p><media xlink:href="jmir_v28i1e86692_app5.docx" xlink:title="DOCX File, 242 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Comparison of model predictions with the majority class classifier (ZeroR) across all 
diagnostic categories.</p><media xlink:href="jmir_v28i1e86692_app6.xlsx" xlink:title="XLSX File, 19 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Diagnostic performance metrics for atrial fibrillation detection across the evaluated models.</p><media xlink:href="jmir_v28i1e86692_app7.xlsx" xlink:title="XLSX File, 12 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Performance of 6 publicly accessible generalist and 2 ECG-specialized vision-enabled large language models (VE-LLMs) in image-based ECG interpretation. Seventy ECG tracings from cardiology ward patients were submitted to VE-LLMs using a standardized detailed prompt, in addition to a short prompt (SP) in the case of the specialized VE-LLMs. Models&#x2019; outputs were compared against expert consensus across 9 diagnostic categories. The Results section (center) contains 2 heatmaps: the balanced accuracy heatmap (left) showing the balanced accuracies, and the Cohen &#x03BA; heatmap (right) showing agreement with expert interpretation beyond chance level per model and diagnostic category.</p><media xlink:href="jmir_v28i1e86692_app8.png" xlink:title="PNG File, 5568 KB"/></supplementary-material></app-group></back></article>