<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e88834</article-id><article-id pub-id-type="doi">10.2196/88834</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Large Language Model Summarization of Physician-to-Physician Calls for Interhospital Transfer of Patients With ST-Elevation Myocardial Infarction: Observational Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Wrenn</surname><given-names>Jesse O</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Behrens</surname><given-names>Madelaine</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hershey</surname><given-names>Mary S</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Maldaver</surname><given-names>Marc</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mitchell</surname><given-names>John</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Thompson</surname><given-names>Trevor</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Triana</surname><given-names>Austin J</given-names></name><degrees>MBA, MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Virk</surname><given-names>Zain M</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Akdas</surname><given-names>Yasemin</given-names></name><degrees>MPH, PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cauley</surname><given-names>Michael R</given-names></name><degrees>PhD, DMin</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Ward</surname><given-names>Michael J</given-names></name><degrees>MBA, MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff6">6</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Monahan</surname><given-names>Ken</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="fn" 
rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Emergency Medicine, Vanderbilt University Medical Center</institution><addr-line>2215 Garland Ave, Light Hall Ste 203</addr-line><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff2"><institution>Department of Biomedical Informatics, Vanderbilt University Medical Center</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff3"><institution>Division of Emergency Medicine, VA Tennessee Valley Healthcare System</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff4"><institution>Department of Internal Medicine, Vanderbilt University Medical Center</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff5"><institution>Office of Community Health and Engagement, Vanderbilt University Medical Center</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff6"><institution>Geriatric Research, Education, and Clinical Center, VA Tennessee Valley Healthcare System</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><aff id="aff7"><institution>Division of Cardiovascular Medicine, Vanderbilt University Medical Center</institution><addr-line>Nashville</addr-line><addr-line>TN</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Dilip</surname><given-names>Monisha</given-names></name></contrib><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Shi</surname><given-names>Wen</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Jesse O Wrenn, MD, PhD, Department of Emergency Medicine, Vanderbilt University Medical Center, 2215 Garland Ave, Light Hall Ste 203, Nashville, TN, 37232, United States, 1 6153220160; <email>jesse.wrenn.1@vumc.org</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>25</day><month>6</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e88834</elocation-id><history><date date-type="received"><day>02</day><month>12</month><year>2025</year></date><date date-type="rev-recd"><day>01</day><month>06</month><year>2026</year></date><date date-type="accepted"><day>02</day><month>06</month><year>2026</year></date></history><copyright-statement>&#x00A9; Jesse O Wrenn, Madelaine Behrens, Mary S Hershey, Marc Maldaver, John Mitchell, Trevor Thompson, Austin J Triana, Zain M Virk, Yasemin Akdas, Michael R Cauley, Michael J Ward, Ken Monahan. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 25.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e88834"/><abstract><sec><title>Background</title><p>Interhospital transfer of patients with suspected ST-elevation myocardial infarction (STEMI) requires timely and robust communication. Clinical uptake of potentially useful information from physician-to-physician phone calls authorizing transfer is low at many institutions, at least in part due to relative inaccessibility of call audio and lack of transcripts or summaries. Large language models (LLMs) can transform text into brief, consistently formatted summaries that could be made available in the electronic health record, thus facilitating the timely availability of clinically relevant data to physicians downstream of the transfer call.</p></sec><sec><title>Objective</title><p>We sought to assess the feasibility of using transcription and LLM summarization to provide written information summarizing transfer calls by adapting the Physician Documentation Quality Instrument (PDQI) to score generated call summaries and evaluate whether LLMs could effectively summarize a curated set of transfer calls.</p></sec><sec sec-type="methods"><title>Methods</title><p>STEMI transfer calls for which our institution was the receiving facility were transcribed and summarized by Whisper and ChatGPT (OpenAI), respectively. Each summary was reviewed by 2 of 7 independent physician raters. Summaries were rated using a Likert scale applied to an 8-domain framework adapted from the PDQI. We calculated summary statistics, including means, SDs, and raw and weighted agreement, and produced visual radar plots to demonstrate ratings for each call. 
We also performed thematic analysis of reviewers&#x2019; comments.</p></sec><sec sec-type="results"><title>Results</title><p>We identified 32 calls, of which 1 (3.1%) was excluded for incompleteness. Raw agreement between raters was 62% (153/248), and the mean of the pairwise weighted &#x03BA; coefficients was 0.19 (SD 0.30; slight agreement). The mean rating of all summaries across all domains was 4.6 of 5 (SD 0.7). The &#x201C;useful&#x201D; (mean 4.8/5, SD 0.5) and &#x201C;consistent&#x201D; (mean 4.9/5, SD 0.6) domains were the highest rated, and the &#x201C;thorough&#x201D; (mean 4.4/5, SD 1.0) and &#x201C;hallucination free&#x201D; (mean 4.4/5, SD 0.9) domains were the lowest rated. The mean score for accuracy was 4.6/5 (SD 0.7). Qualitative analysis found that raters penalized the LLM for inferential hallucinations, although these were often clinically accurate, and for discrepancies related to the calculation of the timing of events.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Despite the limitations inherent in a small pilot cohort, this feasibility study suggests that LLMs can generate accurate and pertinent summaries of interhospital transfer calls for patients with STEMI. Interrater agreement was slight, which may suggest inadequate training of raters, unclear definitions, or a limitation of using the PDQI for this task. 
We identified several important areas for consideration prior to implementation, including thorough assessment of transcription accuracy, prompt engineering to minimize unwanted LLM behavior, and assessment of the impact of incorporating these summaries into clinical care on clinical outcomes.</p></sec></abstract><kwd-group><kwd>ST-elevation myocardial infarction</kwd><kwd>STEMI</kwd><kwd>patient transfer</kwd><kwd>large language models</kwd><kwd>summarization</kwd><kwd>delivery of health care</kwd><kwd>artificial intelligence</kwd><kwd>AI</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>ST-elevation myocardial infarction (STEMI) is a cardiovascular emergency that requires prompt access to the recommended reperfusion treatment, primary percutaneous coronary intervention (PCI), to achieve optimal outcomes. Approximately 61% of hospitals in the United States lack PCI capabilities [<xref ref-type="bibr" rid="ref1">1</xref>], requiring transfer of up to 50% of patients with STEMI [<xref ref-type="bibr" rid="ref2">2</xref>-<xref ref-type="bibr" rid="ref4">4</xref>]. Shorter time spent at the transferring hospital is significantly associated with shorter door-to-balloon times and improved mortality rates [<xref ref-type="bibr" rid="ref5">5</xref>]. The quality of communication between referring and receiving institutions has been identified as a major barrier to effective, timely transfer and, thus, may be a potential target for intervention [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>At Vanderbilt University Medical Center (VUMC) in middle Tennessee, patients are accepted for transfer during phone calls between referring physicians and VUMC cardiologists. 
Although digitally recorded, phone calls with <italic>accepting</italic> physicians, who make the decision to accept the patient for transfer, are not readily available for review by <italic>receiving</italic> physicians, who ultimately care for the patient on arrival and may differ from the accepting physician. In addition, the <italic>accepting</italic> physician may not have the opportunity to communicate directly with the <italic>receiving</italic> physician, leading to further degradation of information transfer. Thus, a durable near&#x2013;primary source account of the clinical details surrounding the transfer would assist in adding an element of continuity across the passage of time and transitions to different physicians.</p><p>Large language models (LLMs) may offer potential solutions by summarizing transfer calls into brief, consistently formatted, digestible text that could be made available in the electronic health record. There has been extensive study of digital scribes, or summarization of recorded conversations between physicians and patients, but less work in the domain of phone call transcription and summarization [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>]. A recent study demonstrates successful summarization of phone calls related to emergency department transfer; however, the process included intermediaries that corrected transcription mistakes and performed manual speaker diarization and labeling [<xref ref-type="bibr" rid="ref10">10</xref>], steps that may not be practically implemented in particularly time-sensitive conditions such as STEMI and may also not be scalable due to personnel and cost requirements. 
We sought to assess the feasibility of transcription and LLM summarization of STEMI transfer calls without human intervention using the Physician Documentation Quality Instrument (PDQI) [<xref ref-type="bibr" rid="ref11">11</xref>], adapted subsequently for evaluation of LLM-generated text [<xref ref-type="bibr" rid="ref12">12</xref>].</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This project was approved by our institutional review board with a quality improvement or nonresearch determination (240725). Therefore, individual consent was not required. All identifiable data were stored on encrypted devices within the secure institutional infrastructure. Participants were not compensated for inclusion.</p></sec><sec id="s2-2"><title>Data Acquisition</title><p>Our institution maintains a log of all patients admitted, transferred, or diagnosed in-house with STEMI. We used this log to identify all patients transferred to VUMC from an outside institution between January 1 and June 30, 2024, for whom there was a clinical concern for STEMI. The recordings of the transfer calls between associated referring physicians and accepting cardiologists were downloaded from an internal audio database (Encore; version 8.6; DVSAnalytics) and reviewed by a single cardiologist (KM) to ensure that they contained both the clinical information upon which the presumptive diagnosis of STEMI was made and the key decision-making discussion that informed the transfer.</p></sec><sec id="s2-3"><title>Transcription and Summarization</title><p>Each call was then transcribed using the OpenAI Whisper model with English language and large model size preselected, hosted locally within the secure institutional infrastructure. Transcripts were neither preprocessed nor cleaned prior to LLM analysis. Six transcripts were selected at random for assessment of word error rate (WER). 
Subtle surname misspellings were not counted as errors, and filler words were excluded from analysis. These verbatim transcripts were analyzed using aiChat, VUMC&#x2019;s HIPAA (Health Insurance Portability and Accountability Act)-compliant GPT-4o&#x2013;powered LLM. This software was chosen because it is readily available at VUMC and HIPAA compliant. Although prompt engineering [<xref ref-type="bibr" rid="ref13">13</xref>], the systematic process of drafting clear instructions for an LLM, can optimize performance, this was outside the scope of our project. The following prompt alone, based on prior qualitative work without any additional specific formatting requests, was used to generate each summary [<xref ref-type="bibr" rid="ref14">14</xref>]:</p><disp-quote><p>Please generate the following for the following conversation:</p><p>&#x201C;Summary: (provide the best possible summary according to my request)</p><p>Medications administered: (provide a list of all medications administered thus far)</p><p>Time: (provide the number of minutes since the chest pain started)</p><p>EKG findings: (provide the EKG findings related to the STEMI)</p><p>Labs: (provide any labs that have returned)</p><p>Vitals: (provide any vitals or information about hemodynamic stability)&#x201D;</p></disp-quote></sec><sec id="s2-4"><title>Quality Measurement</title><p>Audio and aiChat summaries were uploaded to REDCap (Research Electronic Data Capture; Vanderbilt University) [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>] alongside a modified PDQI tool. Modifications to the PDQI included removal of the &#x201C;up-to-date&#x201D; and &#x201C;synthesized&#x201D; domains, which were deemed not applicable to this project. The adapted PDQI also included a question about whether the summary was &#x201C;free from hallucination,&#x201D; with hallucination defined for the purpose of this study as the inclusion of information not verifiable in the call audio. 
This definition has been used previously for analysis of artificial intelligence (AI) tools [<xref ref-type="bibr" rid="ref12">12</xref>]. Three transfer calls from December 2022 were used as a pilot dataset to facilitate familiarity with the rating mechanism and identify ambiguities and/or problems with workflow. To measure the quality of transfer call summaries, each case was assigned to 2 of 7 participating second- and third-year internal medicine resident physicians. Raters were directly queried via email regarding their rotation history, which was verified by one author (KM) against the central schedule posted on the medical center&#x2019;s intranet. They provided the number of cardiac intensive care unit (CICU) rotations and non-CICU cardiology rotations they had completed at that point in their training. The same pair of raters evaluated no more than 2 transfer calls and were blinded to each other&#x2019;s scores but not to the study hypothesis. Each rater independently listened to the raw transfer call audio and rated the LLM-generated summary on each PDQI domain from 1 (&#x201C;not at all&#x201D;) to 5 (&#x201C;completely&#x201D;). Raters did not have access to the transcript. Assignments were made to maximally distribute rater pairings. Raters were given an opportunity to leave narrative comments to describe hallucinations, missing information, or other interesting findings.</p></sec><sec id="s2-5"><title>Statistical and Qualitative Analysis</title><p>We calculated and reported raw interrater reliability and pairwise weighted &#x03BA;. Interrater agreement was presented graphically as well as using radar charts. In addition, we reported mean and SD values for each measured domain. We reviewed rater comments from each of the measurements and conducted a thematic analysis. 
We also reviewed comments from a convenience sample of the 5 summaries with the largest difference between the two assigned raters&#x2019; scores across all domains and calculated the differences between raters in each domain.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Quantitative Results</title><p>We identified 32 transfer calls. Of these 32 calls, 1 (3.1%) was excluded from analysis due to the recording being incomplete, leaving a sample size of 31 (96.9%). The average WER for 6 evaluated calls was 1.6% (SD 0.4%), with a 1.4% (70/4865) total WER across the 6 sampled calls. Each call summary was rated on the 8 PDQI domains by 2 raters for a total of 496 ratings or 248 total pairs. At the time of review, the 7 residents rating the summaries had completed an average of 2.4 (SD 1.7) CICU rotations and 3.4 (SD 0.9) non-CICU cardiology rotations. Each rater evaluated 8 or 9 summaries. Of the paired ratings, 62% (153/248) matched exactly between both raters, pairwise weighted &#x03BA; coefficients ranged from &#x2212;0.23 (no agreement) to 1 (perfect agreement), and the mean of the pairwise weighted &#x03BA; coefficients was 0.19 (SD 0.30; slight agreement). <xref ref-type="fig" rid="figure1">Figure 1</xref> shows radar plots of the raters&#x2019; scores for each reviewed summary document and displays visually the overall degree of scoring overlap among the rater pairs.</p><p><xref ref-type="table" rid="table1">Table 1</xref> provides a summary of ratings by PDQI domain. No summary was rated perfectly by both raters, but there was perfect agreement between raters on summaries 6 and 27. For each of the 8 domains, the mean score exceeded 4 on the 5-point Likert scale. The mean score for all ratings across all domains was 4.6 (SD 0.7).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Radar plot representation of interrater variability for each call summary. 
For a given radar plot, each vertex of the octagon represents one of the adapted Physician Documentation Quality Instrument domains, as depicted in the legend. For each summary, one rater&#x2019;s scores are shaded in red, and the other rater&#x2019;s scores are shaded in blue. The overlap between the 2 reviewers is indicated in purple. For example, raters agreed perfectly on the summaries of calls 6 and 27 and disagreed substantially on the summaries of calls 5, 14, 19, 25, and 31.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e88834_fig01.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Summary statistics by Physician Documentation Quality Instrument domain.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Domain</td><td align="left" valign="bottom">Score (1-5), mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Accurate</td><td align="left" valign="top">4.6 (0.7)</td></tr><tr><td align="left" valign="top">Thorough</td><td align="left" valign="top">4.4 (1.0)</td></tr><tr><td align="left" valign="top">Useful</td><td align="left" valign="top">4.8 (0.5)</td></tr><tr><td align="left" valign="top">Organized</td><td align="left" valign="top">4.6 (0.9)</td></tr><tr><td align="left" valign="top">Comprehensible</td><td align="left" valign="top">4.8 (0.4)</td></tr><tr><td align="left" valign="top">Succinct</td><td align="left" valign="top">4.6 (0.7)</td></tr><tr><td align="left" valign="top">Consistent</td><td align="left" valign="top">4.9 (0.6)</td></tr><tr><td align="left" valign="top">Hallucination free</td><td align="left" valign="top">4.4 (0.9)<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Clinically valid inferences by the large language model were penalized by raters as hallucinations as the inferences 
were not directly verifiable in the call audio, so the mean score in the &#x201C;hallucination free&#x201D; domain should be interpreted with caution.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Qualitative Results</title><p>The descriptive comments by raters identified hallucinations, which often took the form of inferences made by the LLM. For multiple patients, the LLM inferred a patient&#x2019;s hemodynamic stability from the vital signs provided or from a description of the patient&#x2019;s appearance as pale or diaphoretic. Although these inferences were not reported as inaccurate, they were not explicitly mentioned in the calls, which prompted the raters to label them as hallucinations. For example, raters noted that the LLM inferred hepatic encephalopathy as a diagnosis (unstated in the call) from the combination of altered mental status and a history of cirrhosis (both stated in the call).</p><p>Noninferential inaccuracies were uncommon. However, on several occasions, when the referring physician voiced that a medication was considered or ready to be given, it was cited as given by the LLM. Vital signs were mentioned as misreported once. The raters described more instances of omission of details that they believed would have been useful to the receiving team, including laboratory tests, relevant history, and examination findings. However, raters often praised the summaries for filtering out parts of the conversation that they judged as not important to the receiving team.</p><p>Raters identified that the LLM appeared to have difficulty with both absolute and relative time. In total, 6.5% (2/31) of the summaries had events listed in the wrong sequence. 
Several summaries (7/31, 23%) contained inferences regarding the time of a call that was not explicitly mentioned, both absolute clock time and with respect to the time since the chest pain started, the latter of which was explicitly requested by the prompt.</p><p>Interrater agreement calculated using the mean of the pairwise weighted &#x03BA; coefficients was slight at 0.19 (SD 0.30). A total of 16.1% (5/31) of the summaries reviewed (5, 14, 19, 25, and 31) had notably worse agreement than the others. One specific rater was found to have given the lower average rating in 4 of these 5 summaries and was also a rater on the fifth (summary 19), which had the same average rating between the 2 raters but substantial differences in domain scores. This rater was very thorough with their comments, and most often referred to missing information that the rater believed could alter management, inaccuracies regarding assessment of timing, and inferential hallucinations. In these 5 calls, differences between ratings were the largest in the &#x201C;thorough,&#x201D; &#x201C;hallucination free,&#x201D; and &#x201C;organized&#x201D; domains.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><sec id="s4-1-1"><title>Overview</title><p>This feasibility study explored the use of LLMs to summarize phone calls between referring and accepting physicians in the setting of emergency care for patients with suspected STEMI and evaluated this process using standardized assessment tools. With the caveat that mean scores were derived from ratings with only slight agreement, the high (adapted) PDQI score suggests that the LLM-based approach yielded summaries that were accurate and reproducible. Transfer call summaries, particularly when compared to complete opacity, may represent a transformative solution to address data loss during interhospital transfers. 
This work demonstrates the feasibility of using an LLM to summarize transcripts of STEMI transfer calls and provides a foundation for further investigation of automated summarization of call audio in patient transfers.</p></sec><sec id="s4-1-2"><title>Call Summary Quality</title><p>No PDQI domain received ratings under 4/5 when averaged over the summaries. The &#x201C;thorough&#x201D; and &#x201C;hallucination free&#x201D; domains had the lowest scores of the 8 measured domains. Whether a lower score in the &#x201C;thorough&#x201D; domain represents the LLM system behaving as designed given inherent limitations of how thorough a summary can be or reflects true information loss is unclear as this was not addressed in the comments left by raters. The distinction is important as redesigning the prompt could generate more thorough, albeit potentially longer, summaries. However, the consistently high ratings for the &#x201C;useful&#x201D; domain are reassuring and suggest that the lower scores in the &#x201C;thorough&#x201D; domain are less likely to be due to diminished transfer of high-yield information.</p><p>We found that there were 2 types of hallucinations referred to in the comments regarding the &#x201C;hallucination free&#x201D; domain. The first was fabrication, which could clearly cause harm if patients&#x2019; management was changed due to inaccurate information. The second was clinical inference (such as inferring hemodynamic instability from words such as &#x201C;pale&#x201D; or &#x201C;diaphoretic&#x201D;), which was considered reasonable and valid by reviewers but was still labeled as a hallucination because the information was not explicitly verifiable via the call audio. 
In this initial study, the &#x201C;hallucination free&#x201D; rating could represent the worst-case scenario or upper bound, but future work might benefit from parsing further the definition of the hallucination domain by including fabrication and valid inference as 2 separate domains.</p><p>Although relatively uncommon, fabrications could potentially adversely influence patient care, particularly if there are no associated primary data included in the summary from which receiving physicians can draw their own conclusions (ie, a receiving physician may interpret vital signs differently from the LLM, but there is no analogous corroborating evidence to support whether a medication was given). Furthermore, if the LLM reported a categorical assessment of a variable such as vital signs and the primary data were not included in the summary, a miscategorized vital sign may prompt misclassification. However, there are strategies for mitigation of LLM hallucinations that were not within the scope of this project [<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>That the LLM had difficulty with the concept of time since the chest pain started is not surprising. Although LLMs have recently achieved significant advances in mathematical reasoning, this remains a challenging frontier in AI research [<xref ref-type="bibr" rid="ref18">18</xref>]. The prompt we designed requested the number of minutes since the chest pain started, which may require knowledge of the time of the call, the time of onset of chest pain, and the ability to reason mathematically. An alteration of the prompt to request any relevant information from the call on the time when symptoms started as opposed to &#x201C;the number of minutes since chest pain started&#x201D; might improve the summary&#x2019;s accuracy with respect to the timing of relevant events.</p><p>There are several additional issues that are important to consider prior to implementation. 
The average WER was 1.6% (SD 0.4%) for 6 randomly selected transcripts, which is reassuring; however, we did not thoroughly assess the conceptual accuracy of all transcripts, nor did we analyze common errors of transcription. Propagated errors of transcription may have degraded the quality of the summary. Assurance that the transcription model includes medical terminology and drug names is important, as is thorough assessment of transcription quality. Further prompt engineering may reduce the incidence of hallucinations, particularly with respect to the timing of events, including the onset of chest pain. For example, updating the prompt to request only timing that was explicitly discussed in the call may be helpful. Finally, to engender confidence, it may be reasonable to request quotations from the transcript so that physicians can look back to the primary data to confirm critical information.</p></sec></sec><sec id="s4-2"><title>Limitations</title><p>There were several limitations to this study. The transcription software and LLM were chosen for convenience by the study team, and alternative software or alternate versions could produce different results. The sample size was small, which could limit overall generalizability, although the patient population was relatively homogeneous (ie, patients being transferred to a tertiary care center due to concern about STEMI). The selection process included information-rich calls that contained complete clinical and decision-making information, which introduced selection bias. 
Performance on a randomly selected sample of calls would improve the generalizability of the study to the broader clinical environment, although using an enriched dataset for this feasibility study is a reasonable initial step.</p><p>Lack of a comparator such as human expert&#x2013;generated summaries limits interpretability of the summary quality scores, and the study&#x2019;s retrospective design limits our ability to understand the clinical adequacy of the summaries and downstream effects of summarization on patient care. A prospective study measuring the impact of incorporation of summaries into clinical workflow on patient outcomes (such as morbidity, mortality, time in the intensive care unit, and overall length of stay) compared to an expert-generated comparator could address some of these design limitations.</p><p>Although we did not test different prompts, the prompt used in this study was engineered to optimize the inclusion of key objective data while generating a summary that could be consumed efficiently by a busy clinician. Future work could address this limitation by having receiving physicians evaluate the overall utility of summaries generated using different prompts.</p><p>The second- and third-year internal medicine residents reviewing the summaries did not evaluate them from the perspective of interventional or critical care cardiologists; however, they had enough exposure to CICU and general cardiology rotations in their training to reasonably make assessments of summary quality. Furthermore, they are often the intended beneficiaries of call summaries (ie, the first-line recipients of post-PCI patients with STEMI). That said, their relative inexperience could lead to different assessments of the importance of certain elements of the call from those of a more experienced clinician, particularly for the &#x201C;useful&#x201D; and &#x201C;thorough&#x201D; domains, each of which relies on the communication of information deemed important. 
Although blinded to the scores of others, raters were not blinded to the LLM-generated source of the summaries, which could introduce rater bias and artificially inflate or deflate scores. They were also not blinded to the study hypothesis, which could introduce confirmation bias as raters could subconsciously inflate scores to align with the goals of the study.</p><p>Although infrequent, there were a few substantial disagreements between raters on the quality of summaries, with slight weighted agreement. The reported mean scores for each of the domains must be interpreted with caution given discordant ratings. It is also important to note that the observed slight agreement may partially reflect a statistical ceiling effect as 74.8% (371/496) of the total ratings were 5/5, and &#x03BA; can underestimate agreement when ratings cluster within a narrow range and chance agreement is inflated. It is unclear whether the remaining disagreement represents inadequate training of raters, unclear definitions, or a limitation of using the PDQI for this task. That one particular rater often provided lower ratings than their counterparts may reflect lenience on the part of the other raters, particularly in light of the possibility of confirmation bias resulting from the fact that raters were unblinded to the study hypothesis. Prior to deployment in clinical practice, in addition to larger transfer call sample sizes and proportionally larger numbers of summary raters, follow-up studies could include additional raters per summary to more robustly characterize the frequency and magnitude of outliers in the domain of interrater disparities.</p><p>The original 9-item PDQI (PDQI-9) was developed and validated for the evaluation of notes across 9 criteria. At the conception of this study, we identified an unvalidated adaptation of the PDQI-9 that assessed the quality of LLM-generated text. 
Since then, there has been a validated evaluation of similar adaptations of the PDQI-9 for LLM-generated text that include the addition of the &#x201C;hallucination free&#x201D; domain and the removal of the &#x201C;up-to-date&#x201D; domain [<xref ref-type="bibr" rid="ref19">19</xref>]. There has also been validation of a similar adapted PDQI-9 [<xref ref-type="bibr" rid="ref20">20</xref>] for LLM-generated summaries that removed the &#x201C;up-to-date&#x201D; and &#x201C;consistent&#x201D; domains and included the assessment of hallucination in the &#x201C;accuracy&#x201D; domain. Notably, the latter defined hallucinations as falsifications (information or data distorted from the original note) or fabrications (made-up information or data that could be plausible but based on nonexistent facts) present in the summary. Regardless, the use of an unvalidated altered instrument may undermine the reliability of the scores to measure true summary quality.</p><p>To make this type of automated call summary tool usable in day-to-day clinical practice, the individual components (ie, call transcription and LLM analysis) need to be integrated such that summary generation is transparent to the summary recipient. In addition, the summary would need to be uploaded to the patient&#x2019;s chart seamlessly, ideally without any incremental effort from the accepting or receiving clinician. Although implementation would require IT support, the use of ambient AI to document outpatient clinic visits [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>] suggests that this objective is readily achievable in the tertiary medical center environment.</p></sec><sec id="s4-3"><title>Conclusions</title><p>In a feasibility study, we demonstrated that transcription and LLM summarization can provide information about the content of STEMI transfer calls between institutions and, thus, leveraged primary data that are routinely collected but not optimally used. 
The use of an adapted PDQI has notable limitations that must be addressed to more accurately assess summary quality, and mean scores must be interpreted cautiously given slight interrater agreement; however, on a subset of curated information-rich transfer calls, summaries were demonstrated to be useful and accurate. Given the risks associated with physician handoffs [<xref ref-type="bibr" rid="ref23">23</xref>], our results emphasize LLMs&#x2019; potential as a transformative tool to address data loss during interhospital transfers, particularly when compared to the current alternative of total information loss.</p></sec></sec></body><back><ack><p>The authors would like to thank Wendy Daigle, RN, MSN, CCRN, for providing the ST-elevation myocardial infarction log and Dana Wilson, RN, MBA, MMHC, NEA-BC, for facilitating access to the digital phone call database.</p></ack><notes><sec><title>Funding</title><p>JOW and MRC were supported by the National Heart, Lung, and Blood Institute of the National Institutes of Health under award T32HL170986.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are not publicly available due to the impracticality of redacting protected health information from audio files, transcripts, and summary documents but are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: JOW, YA, MRC, KM, MJW</p><p>Data curation: JOW, MB, MSH, MM, JM, TT, AJT, ZMV</p><p>Formal analysis: JOW</p><p>Supervision: KM, MJW</p><p>Writing&#x2014;original draft: JOW</p><p>Writing&#x2014;review and editing: JOW, MB, MSH, MM, JM, TT, AJT, ZMV, YA, MRC, KM, MJW</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CICU</term><def><p>cardiac intensive care 
unit</p></def></def-item><def-item><term id="abb3">HIPAA</term><def><p>Health Insurance Portability and Accountability Act</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">PCI</term><def><p>percutaneous coronary intervention</p></def></def-item><def-item><term id="abb6">PDQI</term><def><p>Physician Documentation Quality Instrument</p></def></def-item><def-item><term id="abb7">PDQI-9</term><def><p>9-item Physician Documentation Quality Instrument</p></def></def-item><def-item><term id="abb8">REDCap</term><def><p>Research Electronic Data Capture</p></def></def-item><def-item><term id="abb9">STEMI</term><def><p>ST-elevation myocardial infarction</p></def></def-item><def-item><term id="abb10">VUMC</term><def><p>Vanderbilt University Medical Center</p></def></def-item><def-item><term id="abb11">WER</term><def><p>word error rate</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Langabeer</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Henry</surname><given-names>TD</given-names> </name><name name-style="western"><surname>Kereiakes</surname><given-names>DJ</given-names> </name><etal/></person-group><article-title>Growth in percutaneous coronary intervention capacity relative to population and disease prevalence</article-title><source>J Am Heart Assoc</source><year>2013</year><month>10</month><day>28</day><volume>2</volume><issue>6</issue><fpage>e000370</fpage><pub-id pub-id-type="doi">10.1161/JAHA.113.000370</pub-id><pub-id pub-id-type="medline">24166491</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chakrabarti</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Krumholz</surname><given-names>HM</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Rumsfeld</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Nallamothu</surname><given-names>BK</given-names> </name><collab>National Cardiovascular Data Registry</collab></person-group><article-title>Time-to-reperfusion in patients undergoing interhospital transfer for primary percutaneous coronary intervention in the US: an analysis of 2005 and 2006 data from the National Cardiovascular Data Registry</article-title><source>J Am Coll Cardiol</source><year>2008</year><month>06</month><day>24</day><volume>51</volume><issue>25</issue><fpage>2442</fpage><lpage>2443</lpage><pub-id pub-id-type="doi">10.1016/j.jacc.2008.02.071</pub-id><pub-id pub-id-type="medline">18565404</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jollis</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Al-Khalidi</surname><given-names>HR</given-names> </name><name name-style="western"><surname>Monk</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Expansion of a regional ST-segment-elevation myocardial infarction system to an entire state</article-title><source>Circulation</source><year>2012</year><month>07</month><day>10</day><volume>126</volume><issue>2</issue><fpage>189</fpage><lpage>195</lpage><pub-id pub-id-type="doi">10.1161/CIRCULATIONAHA.111.068049</pub-id><pub-id pub-id-type="medline">22665718</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jollis</surname><given-names>JG</given-names> </name><name 
name-style="western"><surname>Roettig</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Aluko</surname><given-names>AO</given-names> </name><etal/></person-group><article-title>Implementation of a statewide system for coronary reperfusion for ST-segment elevation myocardial infarction</article-title><source>JAMA</source><year>2007</year><month>11</month><day>28</day><volume>298</volume><issue>20</issue><fpage>2371</fpage><lpage>2380</lpage><pub-id pub-id-type="doi">10.1001/jama.298.20.joc70124</pub-id><pub-id pub-id-type="medline">17982184</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>TY</given-names> </name><name name-style="western"><surname>Nallamothu</surname><given-names>BK</given-names> </name><name name-style="western"><surname>Krumholz</surname><given-names>HM</given-names> </name><etal/></person-group><article-title>Association of door-in to door-out time with reperfusion delays and outcomes among patients transferred for primary percutaneous coronary intervention</article-title><source>JAMA</source><year>2011</year><month>06</month><day>22</day><volume>305</volume><issue>24</issue><fpage>2540</fpage><lpage>2547</lpage><pub-id pub-id-type="doi">10.1001/jama.2011.862</pub-id><pub-id pub-id-type="medline">21693742</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ward</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Vogus</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Bonnet</surname><given-names>K</given-names> </name><name name-style="western"><surname>Moser</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Schlundt</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kripalani</surname><given-names>S</given-names> </name></person-group><article-title>Breaking down walls: a qualitative evaluation of perceived emergency department delays for patients transferred with ST-elevation myocardial infarction</article-title><source>BMC Emerg Med</source><year>2020</year><month>08</month><day>6</day><volume>20</volume><issue>1</issue><fpage>60</fpage><pub-id pub-id-type="doi">10.1186/s12873-020-00355-6</pub-id><pub-id pub-id-type="medline">32762657</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ha</surname><given-names>E</given-names> </name><name name-style="western"><surname>Choon-Kon-Yune</surname><given-names>I</given-names> </name><name name-style="western"><surname>Murray</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Evaluating the usability, technical performance, and accuracy of artificial intelligence scribes for primary care: competitive analysis</article-title><source>JMIR Hum Factors</source><year>2025</year><month>07</month><day>23</day><volume>12</volume><fpage>e71434</fpage><pub-id pub-id-type="doi">10.2196/71434</pub-id><pub-id pub-id-type="medline">40700466</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Buchem</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Boosman</surname><given-names>H</given-names> </name><name name-style="western"><surname>Bauer</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Kant</surname><given-names>IM</given-names> </name><name name-style="western"><surname>Cammel</surname><given-names>SA</given-names> </name><name 
name-style="western"><surname>Steyerberg</surname><given-names>EW</given-names> </name></person-group><article-title>The digital scribe in clinical practice: a scoping review and research agenda</article-title><source>NPJ Digit Med</source><year>2021</year><month>03</month><day>26</day><volume>4</volume><issue>1</issue><fpage>57</fpage><pub-id pub-id-type="doi">10.1038/s41746-021-00432-5</pub-id><pub-id pub-id-type="medline">33772070</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Duggan</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Gervase</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schoenbaum</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Clinician experiences with ambient scribe technology to assist with documentation burden and efficiency</article-title><source>JAMA Netw Open</source><year>2025</year><month>02</month><day>3</day><volume>8</volume><issue>2</issue><fpage>e2460637</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.60637</pub-id><pub-id pub-id-type="medline">39969880</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sezgin</surname><given-names>E</given-names> </name><name name-style="western"><surname>Sirrianni</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Kranz</surname><given-names>K</given-names> </name></person-group><article-title>Evaluation of a digital scribe: conversation summarization for emergency department consultation calls</article-title><source>Appl Clin Inform</source><year>2024</year><month>05</month><day>15</day><volume>15</volume><issue>3</issue><fpage>600</fpage><lpage>611</lpage><pub-id 
pub-id-type="doi">10.1055/a-2327-4121</pub-id><pub-id pub-id-type="medline">38749477</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stetson</surname><given-names>PD</given-names> </name><name name-style="western"><surname>Bakken</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wrenn</surname><given-names>JO</given-names> </name><name name-style="western"><surname>Siegler</surname><given-names>EL</given-names> </name></person-group><article-title>Assessing electronic note quality using the Physician Documentation Quality Instrument (PDQI-9)</article-title><source>Appl Clin Inform</source><year>2012</year><volume>3</volume><issue>2</issue><fpage>164</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.4338/aci-2011-11-ra-0070</pub-id><pub-id pub-id-type="medline">22577483</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tierney</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Gayre</surname><given-names>G</given-names> </name><name name-style="western"><surname>Hoberman</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Ambient artificial intelligence scribes to alleviate the burden of clinical documentation</article-title><source>NEJM Catal Innov Care Deliv</source><year>2024</year><month>02</month><day>21</day><volume>5</volume><issue>3</issue><pub-id pub-id-type="doi">10.1056/CAT.23.0404</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>W</given-names> </name><name 
name-style="western"><surname>Fu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Hayashi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Neubig</surname><given-names>G</given-names> </name></person-group><article-title>Pre-train, prompt, and predict: a systematic survey of prompting methods in natural language processing</article-title><source>ACM Comput Surv</source><year>2023</year><volume>55</volume><issue>9</issue><fpage>1</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1145/3560815</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ward</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Kripalani</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mu&#x00F1;oz</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Association of physician coordination with interfacility transfer acceptance timeliness</article-title><source>Am J Accountable Care</source><year>2022</year><month>09</month><volume>10</volume><issue>3</issue><fpage>7</fpage><lpage>15</lpage><pub-id pub-id-type="doi">10.37765/ajac.2022.89231</pub-id><pub-id pub-id-type="medline">38617098</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harris</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Taylor</surname><given-names>R</given-names> </name><name name-style="western"><surname>Thielke</surname><given-names>R</given-names> </name><name name-style="western"><surname>Payne</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Gonzalez</surname><given-names>N</given-names> </name><name name-style="western"><surname>Conde</surname><given-names>JG</given-names> </name></person-group><article-title>Research electronic data capture (REDCap)&#x2014;a metadata-driven methodology and workflow process for providing translational research informatics support</article-title><source>J Biomed Inform</source><year>2009</year><month>04</month><volume>42</volume><issue>2</issue><fpage>377</fpage><lpage>381</lpage><pub-id pub-id-type="doi">10.1016/j.jbi.2008.08.010</pub-id><pub-id pub-id-type="medline">18929686</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harris</surname><given-names>PA</given-names> </name><name name-style="western"><surname>Taylor</surname><given-names>R</given-names> </name><name name-style="western"><surname>Minor</surname><given-names>BL</given-names> </name><etal/></person-group><article-title>The REDCap consortium: building an international community of software platform partners</article-title><source>J Biomed Inform</source><year>2019</year><month>07</month><volume>95</volume><fpage>103208</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2019.103208</pub-id><pub-id pub-id-type="medline">31078660</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>W</given-names> </name><etal/></person-group><article-title>A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions</article-title><source>ACM Trans Inf 
Syst</source><year>2025</year><volume>43</volume><issue>2</issue><fpage>1</fpage><lpage>55</lpage><pub-id pub-id-type="doi">10.1145/3703155</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>PY</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>TS</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><etal/></person-group><article-title>A survey on large language models for mathematical reasoning</article-title><source>ACM Comput Surv</source><year>2026</year><volume>58</volume><issue>8</issue><fpage>1</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1145/3786333</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Palm</surname><given-names>E</given-names> </name><name name-style="western"><surname>Manikantan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mahal</surname><given-names>H</given-names> </name><name name-style="western"><surname>Belwadi</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Pepin</surname><given-names>ME</given-names> </name></person-group><article-title>Assessing the quality of AI-generated clinical notes: validated evaluation of a large language model ambient scribe</article-title><source>Front Artif Intell</source><year>2025</year><volume>8</volume><fpage>1691499</fpage><pub-id pub-id-type="doi">10.3389/frai.2025.1691499</pub-id><pub-id pub-id-type="medline">41199808</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Croxford</surname><given-names>E</given-names> </name><name 
name-style="western"><surname>Gao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Pellegrino</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Development and validation of the provider documentation summarization quality instrument for large language models</article-title><source>J Am Med Inform Assoc</source><year>2025</year><month>06</month><day>1</day><volume>32</volume><issue>6</issue><fpage>1050</fpage><lpage>1060</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaf068</pub-id><pub-id pub-id-type="medline">40323321</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kakaday</surname><given-names>R</given-names> </name><name name-style="western"><surname>Herrera</surname><given-names>EZ</given-names> </name><name name-style="western"><surname>Coskey</surname><given-names>O</given-names> </name><name name-style="western"><surname>Hertel</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Kaiser</surname><given-names>P</given-names> </name></person-group><article-title>The STREAMLINE pilot study on time reduction and efficiency in AI-mediated logging for improved note-taking experience</article-title><source>Appl Clin Inform</source><year>2025</year><month>05</month><volume>16</volume><issue>3</issue><fpage>614</fpage><lpage>621</lpage><pub-id pub-id-type="doi">10.1055/a-2559-5791</pub-id><pub-id pub-id-type="medline">40097146</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>TL</given-names> </name><name name-style="western"><surname>Hetherington</surname><given-names>TC</given-names> </name><name name-style="western"><surname>Stephens</surname><given-names>C</given-names> 
</name><etal/></person-group><article-title>AI-powered clinical documentation and clinicians&#x2019; electronic health record experience: a nonrandomized clinical trial</article-title><source>JAMA Netw Open</source><year>2024</year><month>09</month><day>3</day><volume>7</volume><issue>9</issue><fpage>e2432460</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.32460</pub-id><pub-id pub-id-type="medline">39240568</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Solet</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Norvell</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Rutan</surname><given-names>GH</given-names> </name><name name-style="western"><surname>Frankel</surname><given-names>RM</given-names> </name></person-group><article-title>Lost in translation: challenges and opportunities in physician-to-physician communication during patient handoffs</article-title><source>Acad Med</source><year>2005</year><month>12</month><volume>80</volume><issue>12</issue><fpage>1094</fpage><lpage>1099</lpage><pub-id pub-id-type="doi">10.1097/00001888-200512000-00005</pub-id><pub-id pub-id-type="medline">16306279</pub-id></nlm-citation></ref></ref-list></back></article>