<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e81915</article-id><article-id pub-id-type="doi">10.2196/81915</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Improving the Understandability of Clinical Guidelines: Development and Evaluation of a GPT-4&#x2013;Based Pipeline</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Jones</surname><given-names>Matthew D</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Torgbi</surname><given-names>Melissa</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tayyar Madabushi</surname><given-names>Harish</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Life Sciences, University of Bath</institution><addr-line>Claverton 
Down</addr-line><addr-line>Bath</addr-line><country>United Kingdom</country></aff><aff id="aff2"><institution>Department of Computer Science, University of Bath</institution><addr-line>Bath</addr-line><country>United Kingdom</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Gissel</surname><given-names>Christian</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Mira</surname><given-names>Jose</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Matthew D Jones, PhD, Department of Life Sciences, University of Bath, Claverton Down, Bath, BA2 7AY, United Kingdom, 44 1225383829; <email>M.D.Jones@bath.ac.uk</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>23</day><month>2</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e81915</elocation-id><history><date date-type="received"><day>05</day><month>08</month><year>2025</year></date><date date-type="rev-recd"><day>21</day><month>01</month><year>2026</year></date><date date-type="accepted"><day>22</day><month>01</month><year>2026</year></date></history><copyright-statement>&#x00A9; Matthew D Jones, Melissa Torgbi, Harish Tayyar Madabushi. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 23.2.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e81915"/><abstract><sec><title>Background</title><p>Difficulty in finding and understanding information in clinical guidelines contributes to medication errors. Large language models (LLMs) can simplify complex text to aid in understanding, but this approach to improving the quality of guidelines has not been investigated. However, LLMs are also known to hallucinate or generate outputs that may not align with reality.</p></sec><sec><title>Objective</title><p>This study aimed to develop and evaluate an LLM pipeline to improve the readability of clinical guidelines while ensuring the preservation of critical content.</p></sec><sec sec-type="methods"><title>Methods</title><p>To align LLM revisions with research evidence and enable comparison with manual editing, the National Health Service Injectable Medicines Guide (IMG) was used as a case study, to which a GPT-4&#x2013;based pipeline was applied, with prompts based on user testing&#x2013;derived recommendations for IMG authors. 
This enabled readability comparisons between various IMG guideline versions: original, manually revised, or GPT-4&#x2013;revised using the user testing&#x2013;derived recommendations, and fully user tested. Readability was evaluated using readability metrics and ratings from 3 expert pharmacists. Content similarity before and after LLM revision was assessed using BERT (bidirectional encoder representations from transformers) scores and expert pharmacist review.</p></sec><sec sec-type="results"><title>Results</title><p>Considering 20 IMG guidelines used in practice, BERT scores indicated high semantic similarity between the original and LLM-revised guidelines (0.88-0.96). An omission, addition, or change in meaning was identified by at least one pharmacist in 30 (20%), 7 (5%), and 18 (12%) of the 153 guideline subsections, respectively. The SMOG (Simple Measure of Gobbledygook) grade showed a small but significant improvement in readability for the LLM-revised guidelines (mean difference 0.32, 95% CI 0.10&#x2010;0.55; <italic>P</italic>=.02) and the manually revised versions (mean difference 0.46, 95% CI 0.13&#x2010;0.79; <italic>P</italic>=.03). There was no significant difference between the LLM and manually revised versions (<italic>P</italic>&#x003E;.99). There were no significant differences between Flesch-Kincaid reading grades (<italic>P</italic>=.91). Expert ratings favored the LLM-revised versions for understandability. Considering 2 IMG guidelines from previous research, user testing produced a greater improvement in readability than LLM revision.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Authors should not use current LLMs to modify clinical guidelines without carefully checking the revised text for unintended omissions, additions, or changes in meaning. 
Further work should investigate the potential of LLMs to augment manual user testing and reduce the barriers to the wider use of this approach to improve the safety of clinical guidelines.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>clinical guidelines</kwd><kwd>comprehension</kwd><kwd>guidelines as topic</kwd><kwd>health care professionals</kwd><kwd>health personnel</kwd><kwd>large language models</kwd><kwd>patient safety</kwd><kwd>readability</kwd><kwd>understandability</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Medication errors are a leading cause of avoidable harm in health care systems, with an estimated international cost of US $42 billion per annum [<xref ref-type="bibr" rid="ref1">1</xref>]. Numerous factors can lead to a medication error [<xref ref-type="bibr" rid="ref2">2</xref>], one of which is difficulty in finding or understanding prescribing and medication administration information in clinical guidelines [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. Consequently, guidelines that are contradictory, incomprehensible, or of poor quality are included in frameworks of the causes of patient safety incidents [<xref ref-type="bibr" rid="ref6">6</xref>]. Clarity of writing and formatting also influences guideline adoption [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>Few studies have explored ways to help clinicians more easily find and understand guidelines. Some evidence suggests that iterative user testing and redesign can improve guideline usability by identifying and addressing problem areas [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. 
In one study, user-tested injectable medicines guidelines led to a 2.5-fold increase in error-free intravenous administration and a 96% probability of cost-effectiveness [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. Such improvements result from both the testing process and the application of good practice in information writing and design [<xref ref-type="bibr" rid="ref12">12</xref>]. Consequently, in 2020, format changes and clear writing advice were added to the authors&#x2019; instructions for the United Kingdom&#x2019;s National Health Service Injectable Medicines Guide (IMG). This is an online guide to preparing and administering injectable medicines, mainly used by nurses. Subsequently, more than 300 drug guidelines have been manually updated, but not user tested.</p><p>Most guideline-related medication errors involve documents produced by local health care organizations [<xref ref-type="bibr" rid="ref3">3</xref>]. User testing and revising these &#x201C;local guidelines&#x201D; could therefore yield the greatest safety improvements. As there are many thousands of these documents within a single country, limited expertise in information writing and design within health care organizations may hinder this approach.</p><p>Large language models (LLMs), such as GPT-4, may help overcome this barrier by simplifying complex text. This approach has been tested on health information for patients initially written by subject experts. Early results are mixed: LLMs can improve readability metrics [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref18">18</xref>] and patient understanding [<xref ref-type="bibr" rid="ref15">15</xref>] but can also worsen readability metrics [<xref ref-type="bibr" rid="ref19">19</xref>]. 
These gains may come at a cost, with some content omitted and up to 21% of revised content deemed clinically inappropriate [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. LLMs have also been used to improve understanding of specialist medical records [<xref ref-type="bibr" rid="ref20">20</xref>]. However, they have not yet been applied to improve the readability of clinical guidelines for health care professionals; therefore, it remains unclear whether LLMs can enhance readability without compromising content accuracy or safety.</p><p>Therefore, this study evaluated a GPT-4&#x2013;based pipeline to improve guideline readability while safeguarding against omission, addition, or change of meaning of safety-critical content. To enable comparison with manual revision and align with previous research, the pipeline was applied to the IMG. This enabled various readability comparisons (<xref ref-type="fig" rid="figure1">Figure 1</xref>), as well as comparison of the content of guidelines before and after LLM revision.</p><p>Specific objectives were therefore to (1) develop a GPT-4&#x2013;based pipeline to improve guideline clarity with built-in safeguards; (2) evaluate content similarity of 20 guidelines before and after revision by the GPT-4 pipeline; (3) compare the readability of 20 equivalent original guidelines, LLM-revised guidelines, and manually revised guidelines; and (4) compare the readability of original, LLM-revised, and user-tested aminophylline and voriconazole guidelines.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Readability comparisons reported in this study. &#x201C;Original&#x201D; versions were the guidelines in use in the Injectable Medicines Guide (IMG) in 2019. &#x201C;LLM-revised&#x201D; versions were developed from the original guidelines by the large language model (LLM) pipeline. 
&#x201C;Manually revised&#x201D; versions were the guidelines in use in the IMG in 2024, which had been updated by guideline authors in line with revised authors&#x2019; instructions introduced in 2020. &#x201C;User-tested&#x201D; versions were developed from the original guidelines via a user-testing process during a previous study [<xref ref-type="bibr" rid="ref9">9</xref>].</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e81915_fig01.png"/></fig></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>This study is reported in accordance with the TRIPOD-LLM (Transparent Reporting of a Multivariable Model for Individual Prognosis or Diagnosis - Large Language Model) guideline [<xref ref-type="bibr" rid="ref21">21</xref>].</p><sec id="s2-1"><title>Development and Application of the GPT-4 Pipeline</title><p>IMG guidelines were converted to Markdown and paired with iteratively developed prompts for input to the LLM pipeline. Prompts instructed the LLM to improve clarity and readability without altering content. The LLM pipeline used the OpenAI API and the GPT-4 model. GPT-4 generated &#x201C;LLM-revised&#x201D; Markdown versions of the original guidelines, which were then converted to static HTML for display.</p></sec><sec id="s2-2"><title>Prompting Strategies</title><p>Prompting was based on the Reframing Principles for Instruction-Following LLMs [<xref ref-type="bibr" rid="ref22">22</xref>], which offer tested strategies for constructing effective prompts, including using low-level patterns, examples, simplified sentence structures, and clear constraints. Initially, researchers with expertise in computer science (MT and HTM) constructed prompts based on the user testing&#x2013;derived recommendations incorporated into the IMG authors&#x2019; instructions in 2020 [<xref ref-type="bibr" rid="ref9">9</xref>]. 
To prevent omissions or changes in meaning, prompts required a semantic similarity score above 0.9 before proceeding. While GPT-4 cannot compute semantic accuracy, we hypothesized that it would interpret this instruction to ensure edits were semantically close to the original and that this reference would make the requirement more explicit. Selected outputs of these initial prompts were reviewed by a pharmacist with expertise in intravenous medication administration and information design (MDJ) and discussed with the wider research team. These initial tests found that overly prescriptive instructions for human authors caused overgeneralization and errors, including omissions and additions. Prompt instructions were then iteratively refined to improve effectiveness. The final prompts (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, online supplement) used for both comparisons were a product of this iterative development, incorporating insights from our explorations while omitting some instructions originally intended for human authors.</p><p>GPT-4&#x2013;generated instructions were also explored for comparison 2 (<xref ref-type="fig" rid="figure1">Figure 1</xref>). The original and user-tested guidelines for voriconazole and aminophylline were given to GPT-4, which was instructed to generate prompts for another model to revise the original guidelines to be similar to the user-tested versions, without directly referencing them. However, these prompts were ineffective and were therefore excluded from final experiments.</p></sec><sec id="s2-3"><title>Selection of IMG Guidelines</title><p>Twenty adult IMG guidelines were selected in consultation with the IMG editorial team to cover a range of drugs, administration methods (infusion and injection), reconstitution or dilution needs, and higher-risk or complex cases. 
Versions from 2019 (&#x201C;original&#x201D;) and 2024 (&#x201C;manually revised,&#x201D; updated in line with revised IMG authors&#x2019; instructions) were obtained. Voriconazole and aminophylline guidelines from 2019 and their user-tested revisions were also sourced from prior research [<xref ref-type="bibr" rid="ref9">9</xref>].</p></sec><sec id="s2-4"><title>Guideline Content Similarity</title><p>Content similarity of the 20 IMG guidelines from practice before and after LLM revision was assessed using the bidirectional encoder representations from transformers (BERT) score [<xref ref-type="bibr" rid="ref23">23</xref>] and expert review. The BERT score calculates the similarity between each word or token in a candidate and reference sentence using cosine similarity and the contextual embeddings from the BERT model [<xref ref-type="bibr" rid="ref20">20</xref>]. We used the standard BERT score library to perform this calculation [<xref ref-type="bibr" rid="ref24">24</xref>]. Scores range from 0 to 1, with higher values indicating greater similarity.</p><p>Three intensive care pharmacists independently assessed 9 sections of LLM-revised guidelines (method of administration, reconstitution, dilution, expiry, flushing, adverse effects, extravasation, other comments, and compatibility) for clinically relevant omissions, additions, or changes in meaning compared with the originals. The pharmacists were known to the research team through professional networks and selected for their significant experience in intensive care pharmacy, although this was not formally defined. 
All invited pharmacists participated and were aware of the study aims, although they were blinded to guideline version while assessing content.</p><p>In addition, the outputs were evaluated manually by the research team for modifications to medication names or their order of presentation.</p></sec><sec id="s2-5"><title>Guideline Readability</title><p>Readability was assessed with 2 metrics and expert review. The readability metrics used were the SMOG (Simple Measure of Gobbledygook) grade and the Flesch-Kincaid reading grade. SMOG was chosen for its validation against full comprehension, while Flesch-Kincaid is widely used in health research [<xref ref-type="bibr" rid="ref25">25</xref>]. Both estimate the years of education required to understand a text, so larger scores indicate lower readability. Scores were calculated with the Textstat Python library using the entire guideline text.</p><p>The relative readability of the 20 IMG guidelines from practice before and after LLM revision was also independently assessed by the 3 intensive care pharmacists. Comparing the original and revised versions of each guideline, they rated relative overall and subsection-level understandability using a 5-point Likert scale.</p></sec><sec id="s2-6"><title>Statistical Analysis</title><p>Analyses were performed in Stata v18.0 (StataCorp LLC). Readability metrics and BERT scores were summarized as mean (SD), with differences tested using repeated measures ANOVA with post hoc Bonferroni-corrected paired <italic>t</italic> tests. 
Expert ratings were summarized descriptively, and interrater reliability was assessed using Gwet&#x2019;s AC to account for skewed distributions [<xref ref-type="bibr" rid="ref26">26</xref>].</p></sec><sec id="s2-7"><title>Ethical Considerations</title><p>This study was assessed as having low potential to do harm and thus reviewed and given a favorable opinion by proportionate review by the Department of Life Sciences Departmental Research Ethics Officer at the University of Bath on June 22, 2023 (number 442).</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Pipeline Development</title><p>The LLM pipeline successfully produced output text showing revised versions of the 20 IMG guidelines from practice and the voriconazole and aminophylline guidelines from previous user-testing research. These revised guidelines were then evaluated for content similarity using the methods described earlier.</p></sec><sec id="s3-2"><title>Content Similarity of the Set of 20 Guidelines From Practice</title><p><xref ref-type="table" rid="table1">Table 1</xref> presents the BERT scores comparing the semantic similarity of the 20 IMG guidelines from practice before and after LLM revision. These ranged from 0.88 to 0.96, indicating a high degree of similarity. As a benchmark, Hanna and Bojar [<xref ref-type="bibr" rid="ref27">27</xref>] reported an average BERT score of 0.815 when comparing 2 different, professional human translations of the same source sentences across several distinct linguistic phenomena. The 3 pharmacists&#x2019; ratings of clinically relevant omissions, additions, or changes in meaning following LLM revision of the 20 guidelines from practice showed a high degree of interrater reliability, with Gwet&#x2019;s ACs of 0.87 (omissions), 0.97 (additions), and 0.92 (changes in meaning). 
<xref ref-type="fig" rid="figure2">Figures 2</xref><xref ref-type="fig" rid="figure3"/>-<xref ref-type="fig" rid="figure4">4</xref> show the number of pharmacist reviewers who identified an omission, addition, or change in meaning in each subsection. Of 459 individual subsection ratings (153 subsections each rated by 3 pharmacists), 54 (11.8%) identified an omission, 8 (1.7%) identified an addition, and 20 (4.4%) identified a change in meaning. An omission, addition, or change in meaning was identified by at least one pharmacist in 30 (20%), 7 (5%), and 18 (12%), respectively, of the 153 individual subsections. However, only 8 subsections had an omission identified by all 3 reviewers, and no additions or changes of meaning were identified by all 3 reviewers (<xref ref-type="table" rid="table2">Table 2</xref>). Overall, 65% (36/55) of subsection omissions, additions, or changes of meaning were only identified by 1 reviewer. These differences were concentrated in certain subsections (method of administration, reconstitution, dilution, adverse effects, and other comments) and guidelines (amiodarone, amoxicillin, furosemide, and propofol). 
Only one subsection (flushing) and two guidelines (levetiracetam and paracetamol) had no omissions, additions, or changes in meaning identified by any pharmacist.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>BERT (bidirectional encoder representations from transformers) scores comparing the semantic similarity of the 20 Injectable Medicines Guide guidelines from practice before and after large language model revision.<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Guideline</td><td align="left" valign="bottom">BERT score</td></tr></thead><tbody><tr><td align="left" valign="top">Amiodarone</td><td align="left" valign="top">0.95</td></tr><tr><td align="left" valign="top">Amoxicillin</td><td align="left" valign="top">0.91</td></tr><tr><td align="left" valign="top">Ceftriaxone</td><td align="left" valign="top">0.95</td></tr><tr><td align="left" valign="top">Cyclizine</td><td align="left" valign="top">0.91</td></tr><tr><td align="left" valign="top">Fentanyl</td><td align="left" valign="top">0.96</td></tr><tr><td align="left" valign="top">Flucloxacillin</td><td align="left" valign="top">0.90</td></tr><tr><td align="left" valign="top">Furosemide</td><td align="left" valign="top">0.88</td></tr><tr><td align="left" valign="top">Gentamicin</td><td align="left" valign="top">0.93</td></tr><tr><td align="left" valign="top">Levetiracetam</td><td align="left" valign="top">0.92</td></tr><tr><td align="left" valign="top">Magnesium sulfate</td><td align="left" valign="top">0.90</td></tr><tr><td align="left" valign="top">Meropenem</td><td align="left" valign="top">0.93</td></tr><tr><td align="left" valign="top">Metronidazole</td><td align="left" valign="top">0.90</td></tr><tr><td align="left" valign="top">Noradrenaline</td><td align="left" valign="top">0.94</td></tr><tr><td align="left" valign="top">Omeprazole</td><td align="left" 
valign="top">0.94</td></tr><tr><td align="left" valign="top">Paracetamol</td><td align="left" valign="top">0.94</td></tr><tr><td align="left" valign="top">Phenytoin</td><td align="left" valign="top">0.88</td></tr><tr><td align="left" valign="top">Piperacillin and tazobactam</td><td align="left" valign="top">0.95</td></tr><tr><td align="left" valign="top">Propofol</td><td align="left" valign="top">0.91</td></tr><tr><td align="left" valign="top">Teicoplanin</td><td align="left" valign="top">0.93</td></tr><tr><td align="left" valign="top">Vancomycin</td><td align="left" valign="top">0.88</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Mean (SD): 0.92 (0.02).</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Heatmap showing the number of pharmacist reviewers who identified an omission of information following large language model revision of each subsection of each of the 20 original guidelines. Blank subsections were not present in the relevant guideline. AE: adverse effects; Compat.: compatibility; Expiry: expiry time; Extrav.: extravasation; Flush: flushing; Method: method of administration; Other: other comments; Recon.: reconstitution.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e81915_fig02.png"/></fig><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Heatmap showing the number of pharmacist reviewers who identified an addition of information following large language model revision of each subsection of each of the 20 original guidelines. Blank subsections were not present in the relevant guideline. 
AE: adverse effects; Compat.: compatibility; Expiry: expiry time; Extrav.: extravasation; Flush: flushing; Method: method of administration; Other: other comments; Recon.: reconstitution.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e81915_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Heatmap showing the number of pharmacist reviewers who identified a change in meaning following large language model revision of each subsection of each of the 20 original guidelines. Blank subsections were not present in the relevant guideline. AE: adverse effects; Compat.: compatibility; Expiry: expiry time; Extrav.: extravasation; Flush: flushing; Method: method of administration; Other: other comments; Recon.: reconstitution.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e81915_fig04.png"/></fig><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>The number of subsections with an identified omission, addition, or change of meaning following large language model revision of each of the 20 original guidelines.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="3">The number of subsections with an addition, omission, or change of meaning</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Identified by 1 reviewer</td><td align="left" valign="bottom">Identified by 2 reviewers</td><td align="left" valign="bottom">Identified by 3 reviewers</td></tr></thead><tbody><tr><td align="left" valign="top">Omission</td><td align="left" valign="top">14</td><td align="left" valign="top">8</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">Addition</td><td align="left" valign="top">6</td><td align="left" valign="top">1</td><td align="left" 
valign="top">0</td></tr><tr><td align="left" valign="top">Change</td><td align="left" valign="top">16</td><td align="left" valign="top">2</td><td align="left" valign="top">0</td></tr></tbody></table></table-wrap><p>Key differences described in the pharmacists&#x2019; free text comments included the occasional omission of the contents of an entire subsection (eg, extravasation, other comments, and compatibility), the omission of one specific detail (eg, information relating to one of several doses of a medicine), objective changes in meaning (eg, &#x201C;hypotension&#x201D; changed to &#x201C;hypertension&#x201D; and &#x201C;mg/mL&#x201D; changed to &#x201C;mg&#x201D;) and subjective changes in interpretation (eg, the revised wording implying an adverse effect is probable rather than possible or lessening a safety warning).</p><p>Manual checks identified no alterations to medication names or order of presentation.</p></sec><sec id="s3-3"><title>Readability of the 20 Guidelines From Practice</title><p><xref ref-type="table" rid="table3">Table 3</xref> presents overall readability metrics for the 3 different versions of the IMG from practice. Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> shows the metrics calculated for each individual guideline. Repeated measures ANOVA identified a significant difference between the SMOG grades of at least 2 guideline versions (<italic>P</italic>=.007; df=59), so post hoc paired <italic>t</italic> tests with Bonferroni corrections were performed. The mean difference in SMOG grade between the original and LLM-revised versions was 0.32 (95% CI 0.10&#x2010;0.55; <italic>P</italic>=.02; df=19), indicating a small but significant improvement in readability following LLM revision. 
Similarly, the mean difference in SMOG grade between the original and manually revised versions was 0.46 (95% CI 0.13&#x2010;0.79; <italic>P</italic>=.03; df=19), indicating significant improvement in the readability of the guidelines following changes to the writing guide in 2020. However, the mean difference in SMOG grade between the LLM-revised and manually revised versions was 0.14 (95% CI &#x2212;0.18 to 0.45; <italic>P</italic>&#x003E;.99; df=19), suggesting no difference in the readability of these 2 versions. In contrast, repeated measures ANOVA did not identify differences between the Flesch-Kincaid reading grades of the original, LLM-revised, and manually revised versions (<italic>P</italic>=.91; df=59).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Summary readability metrics for 3 different versions of injectable medicines guides from practice for 20 intravenous drugs.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">SMOG<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> grade, mean (SD)</td><td align="left" valign="bottom">Flesch-Kincaid grade, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Original version</td><td align="left" valign="top">12.3 (0.2)</td><td align="left" valign="top">12.1 (0.4)</td></tr><tr><td align="left" valign="top">LLM<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>-revised version</td><td align="left" valign="top">12.0 (0.2)</td><td align="left" valign="top">12.0 (0.4)</td></tr><tr><td align="left" valign="top">Manually revised version</td><td align="left" valign="top">11.8 (0.1)</td><td align="left" valign="top">11.9 (0.3)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>SMOG: Simple Measure of Gobbledygook.</p></fn><fn id="table3fn2"><p><sup>b</sup>LLM: large language model.</p></fn></table-wrap-foot></table-wrap><p>The 3 
pharmacists&#x2019; ratings of the relative readability of the 20 guidelines from practice before and after LLM revision showed a high degree of interrater reliability, with Gwet&#x2019;s ACs of 0.75 (overall guideline) and 0.85 (subsections). <xref ref-type="table" rid="table4">Table 4</xref> presents the pharmacists&#x2019; ratings of the overall understandability of the guidelines, which favored the LLM-revised versions being easier to understand (26/60 ratings, 43.3%) compared with the original versions (11/60 ratings, 18.3%). Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> summarizes the pharmacists&#x2019; ratings of the relative readability of each subsection. Of 459 individual ratings, 56 favored the original version and 125 favored the LLM-revised version. For 6 of the 9 subsections and 17 of the 20 drugs, there were more ratings that favored the LLM-revised version than the original version. The pharmacists&#x2019; free text comments on relative readability frequently mentioned more concise language and a clearer layout for the LLM-revised versions resulting especially from the use of bullet points. 
However, they also noted that this was sometimes achieved through the omission of information.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>The 3 pharmacists&#x2019; ratings of the overall relative understandability of the 20 original and large language model (LLM)&#x2013;revised guidelines.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Number of ratings, n (%)<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">2019 version is much easier to understand</td><td align="left" valign="top">1 (1.7)</td></tr><tr><td align="left" valign="top">2019 version is slightly easier to understand</td><td align="left" valign="top">10 (16.7)</td></tr><tr><td align="left" valign="top">Both versions are equally easy to understand</td><td align="left" valign="top">23 (38.3)</td></tr><tr><td align="left" valign="top">LLM-revised version is slightly easier to understand</td><td align="left" valign="top">25 (41.7)</td></tr><tr><td align="left" valign="top">LLM-revised version is much easier to understand</td><td align="left" valign="top">1 (1.7)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Three pharmacist reviewers each independently rated 20 guidelines, giving a total of 60 ratings.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Readability of the User-Tested Guidelines</title><p><xref ref-type="table" rid="table5">Table 5</xref> presents the readability metrics for 3 different versions of injectable medicines guides for aminophylline and voriconazole that underwent user testing and subsequent revision in a previous study [<xref ref-type="bibr" rid="ref9">9</xref>]. For aminophylline, LLM revision did not decrease the reading grade, but following user testing and revision, both readability metrics fell. 
For voriconazole, LLM revision decreased both readability grades, but user testing and subsequent revision decreased them even further.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Guideline readability metrics for 3 different versions of injectable medicines guides for aminophylline and voriconazole that underwent user testing and subsequent revision in a previous study [<xref ref-type="bibr" rid="ref9">9</xref>].</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Original version</td><td align="left" valign="bottom">LLM<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup>-revised version</td><td align="left" valign="bottom">User-tested version</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">SMOG<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup> grade</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Aminophylline</td><td align="left" valign="top">11.7</td><td align="left" valign="top">11.8</td><td align="left" valign="top">11.0</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Voriconazole</td><td align="left" valign="top">11.9</td><td align="left" valign="top">10.9</td><td align="left" valign="top">10.6</td></tr><tr><td align="left" valign="top" colspan="4">Flesch-Kincaid grade</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Aminophylline</td><td align="left" valign="top">9.7</td><td align="left" valign="top">9.8</td><td align="left" valign="top">8.3</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Voriconazole</td><td align="left" valign="top">10.5</td><td align="left" 
valign="top">10.3</td><td align="left" valign="top">7.9</td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table5fn2"><p><sup>b</sup>SMOG: Simple Measure of Gobbledygook.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>The GPT-4&#x2013;based pipeline did not introduce clinically relevant omissions, additions, or changes of meaning to &#x2265;80% of individual subsections within the 20 guidelines from practice. However, content was entirely similar in only a few individual guidelines or subsection types, and differences (especially omissions) were concentrated in certain drugs or subsections in an unpredictable way. It is noteworthy that in the cases of furosemide, phenytoin, and vancomycin, GPT-4 did not maintain a BERT score above 0.9.</p><p>SMOG grades suggested that when following similar instructions or prompts, the LLM-based pipeline was as effective as manual editing in producing a small improvement in readability, and this was supported by the expert reviewers&#x2019; ratings of understandability. In contrast, the Flesch-Kincaid reading grade did not find any change in readability. This may be because the Flesch-Kincaid reading grade reflects a lower standard of comprehension than the SMOG grade [<xref ref-type="bibr" rid="ref25">25</xref>] but reinforces the marginal nature of the readability improvement. 
In addition, the GPT-4 pipeline did not produce improvements in either readability metric as large as those resulting from user testing and subsequent document revision.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>These results are aligned with those of studies that have applied LLMs to improve the readability of patient information, which have demonstrated improvements in readability metrics [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref18">18</xref>] that are often associated with the omission of information [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. This association between readability and omission of content is unsurprising, as the removal of complex concepts is likely to make a text easier to understand. However, in a safety-critical context, such as health care, the omission of essential information must be avoided, and evidence to date suggests that LLMs are currently unable to achieve this reliably. In contrast, an iterative process of user testing and subsequent document revision has a lower risk of introducing unintended content changes. In addition, while this study has demonstrated that user testing can improve the readability metrics of clinical guidelines, it has also been shown to have beneficial effects on more important user- or patient-centered outcomes, such as reducing difficulties finding and understanding information [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref28">28</xref>], and subsequently preventing guideline-related medication errors [<xref ref-type="bibr" rid="ref11">11</xref>].</p><p>Therefore, user testing remains the guideline improvement technique with the strongest evidence base. 
However, the findings of this study suggest that specifically designed LLM-based pipelines with appropriate safeguards do have potential to augment (but not replace) a manual user testing process, by providing suggestions for potential improvements during the document revision process. These safeguards should be included as explicit clinician-guided, hard-coded programmatic feature checks (eg, verifying that all medications are included and in the correct order) rather than as an instruction to the LLM, which may be ignored. This approach has the potential to partially address the limited availability of knowledge and skills in information writing and design within health care organizations.</p></sec><sec id="s4-3"><title>Strengths and Limitations</title><p>This study has several strengths, including the use of specific prompts based on previous research and the use of both manually revised and user-tested monographs as comparators. In addition, it evaluated both the content and readability of the guidelines using a combination of calculated metrics and human evaluation, an approach recently recommended for studies in this field [<xref ref-type="bibr" rid="ref29">29</xref>]. However, there are also a number of limitations. First, user- and patient-centered outcomes were not evaluated (eg, the ability of end users to find and understand information in the guidelines; the frequency of medication errors when using revised guidelines), so it is not clear whether the observed small improvements in readability would result in improved patient care. This could be addressed by future randomized studies comparing original and LLM-revised guidelines in actual or simulated patient care. Second, this study used a general LLM rather than a bespoke model trained with health care&#x2013;specific information [<xref ref-type="bibr" rid="ref29">29</xref>], which might have resulted in greater improvements in readability and/or fewer content changes. 
Such models should be used in future studies. Third, while the results were replicable among the 22 drug guides tested in this study, only one type of guideline relating to intravenous drug administration was used. Therefore, results may not be replicable to other types of guideline, especially as the prompting strategy was based on user testing of these specific guidelines. Other types of guidelines and more general prompting strategies should be investigated in future studies. Fourth, the inclusion of a prompt based on semantic similarity did not provide a reliable safeguard against content changes, as the BERT score for 3 guidelines was &#x003C;0.9 (<xref ref-type="table" rid="table1">Table 1</xref>). Future studies should implement semantic similarity thresholds as programmatic post hoc checks (compute the BERT score externally and rerun or reject LLM outputs) and investigate more extensive use of clinician-guided hard-coded programmatic feature checks (eg, verifying that all medications are included and in the correct order). Fifth, the prompting strategy was developed by only a small team, so it is more likely to have been affected by participant subjectivity. This could be addressed by developing prompts among a larger team with a greater range of expertise. Sixth, instruction 9 in the GPT-4 prompts (&#x201C;remove information about the size of the bags&#x201D;) was based on a misinterpretation of previous user testing&#x2013;derived recommendations. 
Six of 12 content review panel comments describing omissions from the &#x201C;Dilution&#x201D; subsection related to the clinically relevant omission of infusion bag volume, suggesting this error contributed to some of the unintended omissions identified in this study.</p></sec><sec id="s4-4"><title>Implications for Practice and Future Research</title><p>The present findings suggest that guideline authors should not use current LLMs to modify clinical guidelines without carefully checking the revised text for unintended omissions, additions, or changes of meaning. Similarly, clinicians should not use current LLMs to answer clinical questions without carefully checking the results against reliable sources of information. However, LLMs do have the potential to augment manual user testing and reduce the barriers to the wider use of this approach to improve the safety of the thousands of local clinical guidelines currently in use. Further work is required to embed LLM functionality into guideline authoring systems and subsequently demonstrate the safety and effectiveness of this approach.</p></sec><sec id="s4-5"><title>Conclusions</title><p>LLMs are able to revise the contents of clinical guidelines to produce small improvements in readability, but with unintended omissions, additions, or changes of meaning to a minority of the information. Small improvements in readability were evident using the SMOG grade and expert review, but not with the Flesch-Kincaid reading grade. It is not known whether these readability improvements would lead to better understanding of, or adherence to, clinical guidelines when used in practice. Omissions of information accounted for the majority of unintended changes to the information contained within the guidelines. 
Guideline-related medication errors continue to occur but may be prevented by the combination of human expertise, user feedback, and appropriate technologies.</p></sec></sec></body><back><ack><p>The authors are grateful to members of the Injectable Medicines Guide editorial team, the pharmacist reviewers, Holly Wilson (Onaya Science), and Joseph Marvin Imperial (University of Bath) for their support of this study. Microsoft Copilot (GPT-5 model) was used during manuscript revision to edit the Introduction and Methods sections for brevity.</p></ack><notes><sec><title>Funding</title><p>This project was funded by a Pump Priming Award from the Department of Life Sciences, University of Bath. Additional funding was received from the Academic Secondment Scheme of Research and Innovation Services at the University of Bath. These funders had no role in study design; collection, analysis, and interpretation of data; writing of the paper; and/or decision to submit for publication.</p></sec><sec><title>Data Availability</title><p>Data and code are available in a public, open access repository. Data created during this research that are not presented in full in this paper are openly available from [<xref ref-type="bibr" rid="ref30">30</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>MDJ: conceptualization, methodology, formal analysis, investigation, writing &#x2013; original draft, funding acquisition</p><p>MT: software, investigation, writing &#x2013; review and editing</p><p>HTM: conceptualization, methodology, resources, writing &#x2013; review and editing</p><p>MDJ is the guarantor.</p></fn><fn fn-type="conflict"><p>MDJ is a member of the National Health Service Injectable Medicines Guide advisory board and held an honorary position in the editorial team at the start of this project. 
The other authors have no competing interests.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BERT</term><def><p>bidirectional encoder representations from transformers</p></def></def-item><def-item><term id="abb2">IMG</term><def><p>Injectable Medicines Guide</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">SMOG</term><def><p>Simple Measure of Gobbledygook</p></def></def-item><def-item><term id="abb5">TRIPOD-LLM</term><def><p>Transparent Reporting of a Multivariable Model for Individual Prognosis or Diagnosis - Large Language Model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="report"><article-title>Medication without harm - global patient safety challenge on medication safety</article-title><year>2017</year><access-date>2026-01-29</access-date><publisher-name>World Health Organization</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://iris.who.int/server/api/core/bitstreams/15520c4f-89d1-4a8f-9bab-6b771aa30acb/content">https://iris.who.int/server/api/core/bitstreams/15520c4f-89d1-4a8f-9bab-6b771aa30acb/content</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Naseralallah</surname><given-names>L</given-names> </name><name name-style="western"><surname>Stewart</surname><given-names>D</given-names> </name><name name-style="western"><surname>Azfar Ali</surname><given-names>R</given-names> </name><name name-style="western"><surname>Paudyal</surname><given-names>V</given-names> </name></person-group><article-title>An umbrella review of systematic reviews on contributory factors to medication errors in health-care settings</article-title><source>Expert Opin Drug 
Saf</source><year>2022</year><month>11</month><volume>21</volume><issue>11</issue><fpage>1379</fpage><lpage>1399</lpage><pub-id pub-id-type="doi">10.1080/14740338.2022.2147921</pub-id><pub-id pub-id-type="medline">36408597</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jones</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Powell</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Exploring the role of guidelines in contributing to medication errors: a descriptive analysis of national patient safety incident data</article-title><source>Drug Saf</source><year>2024</year><month>04</month><volume>47</volume><issue>4</issue><fpage>389</fpage><lpage>400</lpage><pub-id pub-id-type="doi">10.1007/s40264-024-01396-7</pub-id><pub-id pub-id-type="medline">38308152</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mahomedradja</surname><given-names>RF</given-names> </name><name name-style="western"><surname>Schinkel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sigaloff</surname><given-names>KCE</given-names> </name><etal/></person-group><article-title>Factors influencing in-hospital prescribing errors: a systematic review</article-title><source>Br J Clin Pharmacol</source><year>2023</year><month>06</month><volume>89</volume><issue>6</issue><fpage>1724</fpage><lpage>1735</lpage><pub-id pub-id-type="doi">10.1111/bcp.15694</pub-id><pub-id pub-id-type="medline">36805648</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Jones</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Clarke</surname><given-names>J</given-names> </name><name name-style="western"><surname>Feather</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Use of pediatric injectable medicines guidelines and associated medication administration errors: a human reliability analysis</article-title><source>Ann Pharmacother</source><year>2021</year><month>11</month><volume>55</volume><issue>11</issue><fpage>1333</fpage><lpage>1340</lpage><pub-id pub-id-type="doi">10.1177/1060028021999647</pub-id><pub-id pub-id-type="medline">33641479</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lawton</surname><given-names>R</given-names> </name><name name-style="western"><surname>McEachan</surname><given-names>RRC</given-names> </name><name name-style="western"><surname>Giles</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Sirriyeh</surname><given-names>R</given-names> </name><name name-style="western"><surname>Watt</surname><given-names>IS</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>J</given-names> </name></person-group><article-title>Development of an evidence-based framework of factors contributing to patient safety incidents in hospital settings: a systematic review</article-title><source>BMJ Qual Saf</source><year>2012</year><month>05</month><volume>21</volume><issue>5</issue><fpage>369</fpage><lpage>380</lpage><pub-id pub-id-type="doi">10.1136/bmjqs-2011-000443</pub-id><pub-id pub-id-type="medline">22421911</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kastner</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Bhattacharyya</surname><given-names>O</given-names> </name><name name-style="western"><surname>Hayden</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Guideline uptake is influenced by six implementability domains for creating and communicating guidelines: a realist review</article-title><source>J Clin Epidemiol</source><year>2015</year><month>05</month><volume>68</volume><issue>5</issue><fpage>498</fpage><lpage>509</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2014.12.013</pub-id><pub-id pub-id-type="medline">25684154</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Raynor</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Veene</surname><given-names>PD</given-names> </name><name name-style="western"><surname>Bryant</surname><given-names>D</given-names> </name></person-group><article-title>The effectiveness of the summary of product characteristics (SmPC) and recommendations for improvement</article-title><source>Ther Innov Regul Sci</source><year>2014</year><month>03</month><volume>48</volume><issue>2</issue><fpage>255</fpage><lpage>265</lpage><pub-id pub-id-type="doi">10.1177/2168479013501311</pub-id><pub-id pub-id-type="medline">30227505</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jones</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Franklin</surname><given-names>BD</given-names> </name><name name-style="western"><surname>Watson</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Raynor</surname><given-names>DK</given-names> </name></person-group><article-title>User testing to improve retrieval and comprehension of information in guidelines to 
improve medicines safety</article-title><source>J Patient Saf</source><year>2022</year><month>01</month><day>1</day><volume>18</volume><issue>1</issue><fpage>e172</fpage><lpage>e179</lpage><pub-id pub-id-type="doi">10.1097/PTS.0000000000000723</pub-id><pub-id pub-id-type="medline">32569098</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jones</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Franklin</surname><given-names>BD</given-names> </name><name name-style="western"><surname>Raynor</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Thom</surname><given-names>H</given-names> </name><name name-style="western"><surname>Watson</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Kandiyali</surname><given-names>R</given-names> </name></person-group><article-title>Costs and cost-effectiveness of user-testing of health professionals&#x2019; guidelines to reduce the frequency of intravenous medicines administration errors by nurses in the United Kingdom: a probabilistic model based on voriconazole administration</article-title><source>Appl Health Econ Health Policy</source><year>2022</year><month>01</month><volume>20</volume><issue>1</issue><fpage>91</fpage><lpage>104</lpage><pub-id pub-id-type="doi">10.1007/s40258-021-00675-z</pub-id><pub-id pub-id-type="medline">34403128</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jones</surname><given-names>MD</given-names> </name><name name-style="western"><surname>McGrogan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Raynor</surname><given-names>DK</given-names> </name><name 
name-style="western"><surname>Watson</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Franklin</surname><given-names>BD</given-names> </name></person-group><article-title>User-testing guidelines to improve the safety of intravenous medicines administration: a randomised in situ simulation study</article-title><source>BMJ Qual Saf</source><year>2021</year><month>01</month><volume>30</volume><issue>1</issue><fpage>17</fpage><lpage>26</lpage><pub-id pub-id-type="doi">10.1136/bmjqs-2020-010884</pub-id><pub-id pub-id-type="medline">32606212</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Theo Raynor</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Blackwell</surname><given-names>K</given-names> </name><name name-style="western"><surname>Middleton</surname><given-names>W</given-names> </name></person-group><article-title>What do writers need to know about user testing?</article-title><source>Med Writ</source><year>2015</year><month>12</month><volume>24</volume><issue>4</issue><fpage>215</fpage><lpage>218</lpage><pub-id pub-id-type="doi">10.1179/2047480615Z.000000000326</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moons</surname><given-names>P</given-names> </name><name name-style="western"><surname>Van Bulck</surname><given-names>L</given-names> </name></person-group><article-title>Using ChatGPT and Google Bard to improve the readability of written patient information: a proof of concept</article-title><source>Eur J Cardiovasc Nurs</source><year>2024</year><month>03</month><day>12</day><volume>23</volume><issue>2</issue><fpage>122</fpage><lpage>126</lpage><pub-id pub-id-type="doi">10.1093/eurjcn/zvad087</pub-id><pub-id 
pub-id-type="medline">37603843</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haver</surname><given-names>HL</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>CT</given-names> </name><name name-style="western"><surname>Sirajuddin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yi</surname><given-names>PH</given-names> </name><name name-style="western"><surname>Jeudy</surname><given-names>J</given-names> </name></person-group><article-title>Use of ChatGPT, GPT-4, and Bard to improve readability of ChatGPT's answers to common questions about lung cancer and lung cancer screening</article-title><source>AJR Am J Roentgenol</source><year>2023</year><month>11</month><volume>221</volume><issue>5</issue><fpage>701</fpage><lpage>704</lpage><pub-id pub-id-type="doi">10.2214/AJR.23.29622</pub-id><pub-id pub-id-type="medline">37341179</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Salam</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kravchenko</surname><given-names>D</given-names> </name><name name-style="western"><surname>Mesropyan</surname><given-names>N</given-names> </name><etal/></person-group><article-title>ChatGPT makes cardiovascular MRI reports easy-to-understand: a feasibility study</article-title><source>J Cardiovasc Magn Reson</source><year>2024</year><volume>26</volume><fpage>100881</fpage><pub-id pub-id-type="doi">10.1016/j.jocmr.2024.100881</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Patel</surname><given-names>EA</given-names> </name><name 
name-style="western"><surname>Fleischer</surname><given-names>L</given-names> </name><name name-style="western"><surname>Filip</surname><given-names>P</given-names> </name><etal/></person-group><article-title>The use of artificial intelligence to improve readability of otolaryngology patient education materials</article-title><source>Otolaryngol Head Neck Surg</source><year>2024</year><month>08</month><volume>171</volume><issue>2</issue><fpage>603</fpage><lpage>608</lpage><pub-id pub-id-type="doi">10.1002/ohn.816</pub-id><pub-id pub-id-type="medline">38751109</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kianian</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>D</given-names> </name><name name-style="western"><surname>Rojas-Carabali</surname><given-names>W</given-names> </name><name name-style="western"><surname>Agrawal</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tsui</surname><given-names>E</given-names> </name></person-group><article-title>Large language models may help patients understand peer-reviewed scientific articles about ophthalmology: development and usability study</article-title><source>J Med Internet Res</source><year>2024</year><month>12</month><day>24</day><volume>26</volume><fpage>e59843</fpage><pub-id pub-id-type="doi">10.2196/59843</pub-id><pub-id pub-id-type="medline">39719077</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Swisher</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>GC</given-names> </name><name 
name-style="western"><surname>Lee</surname><given-names>MK</given-names> </name><name name-style="western"><surname>Carle</surname><given-names>TR</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>DM</given-names> </name></person-group><article-title>Enhancing health literacy: evaluating the readability of patient handouts revised by ChatGPT&#x2019;s large language model</article-title><source>Otolaryngol Head Neck Surg</source><year>2024</year><month>12</month><volume>171</volume><issue>6</issue><fpage>1751</fpage><lpage>1757</lpage><pub-id pub-id-type="doi">10.1002/ohn.927</pub-id><pub-id pub-id-type="medline">39105460</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stoneham</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Walker</surname><given-names>LC</given-names> </name><name name-style="western"><surname>Newman</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Nicholls</surname><given-names>A</given-names> </name><name name-style="western"><surname>Avis</surname><given-names>D</given-names> </name></person-group><article-title>Can artificial intelligence make elective hand clinic letters easier for patients to understand?</article-title><source>J Hand Surg Eur Vol</source><year>2024</year><month>11</month><volume>49</volume><issue>10</issue><fpage>1269</fpage><lpage>1270</lpage><pub-id pub-id-type="doi">10.1177/17531934241246479</pub-id><pub-id pub-id-type="medline">38641940</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>MW</given-names> </name><name 
name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><access-date>2026-01-29</access-date><conf-name>Proceedings of the 2019 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)</conf-name><conf-date>Jun 2-7, 2019</conf-date><conf-loc>Minneapolis, Minnesota</conf-loc><fpage>4171</fpage><lpage>4186</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/N19-1423/">https://aclanthology.org/N19-1423/</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/N19-1423</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gallifant</surname><given-names>J</given-names> </name><name name-style="western"><surname>Afshar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ameen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The TRIPOD-LLM reporting guideline for studies using large language models</article-title><source>Nat Med</source><year>2025</year><month>01</month><volume>31</volume><issue>1</issue><fpage>60</fpage><lpage>69</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03425-5</pub-id><pub-id pub-id-type="medline">39779929</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Mishra</surname><given-names>S</given-names> </name><name name-style="western"><surname>Khashabi</surname><given-names>D</given-names> </name><name name-style="western"><surname>Baral</surname><given-names>C</given-names> 
</name><name name-style="western"><surname>Choi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Hajishirzi</surname><given-names>H</given-names> </name></person-group><article-title>Reframing instructional prompts to GPTk&#x2019;s language</article-title><year>2022</year><access-date>2026-01-29</access-date><conf-name>Findings of the Association for Computational Linguistics: ACL 2022</conf-name><conf-date>May 22-27, 2022</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2022.findings-acl.50/">https://aclanthology.org/2022.findings-acl.50/</ext-link></comment><pub-id pub-id-type="doi">10.18653/v1/2022.findings-acl.50</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kishore</surname><given-names>V</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Weinberger</surname><given-names>KQ</given-names> </name><name name-style="western"><surname>Artzi</surname><given-names>Y</given-names> </name></person-group><article-title>BERTScore: evaluating text generation with BERT</article-title><source>arXiv</source><comment>Preprint posted online on Apr 21, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1904.09675</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><article-title>BERT score for text generation</article-title><source>GitHub, Inc</source><year>2023</year><access-date>2026-01-29</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/Tiiiger/bert_score">https://github.com/Tiiiger/bert_score</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>LW</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Schmitt</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Wen</surname><given-names>FK</given-names> </name></person-group><article-title>Assessing readability formula differences with written health information materials: application, results, and recommendations</article-title><source>Res Social Adm Pharm</source><year>2013</year><volume>9</volume><issue>5</issue><fpage>503</fpage><lpage>516</lpage><pub-id pub-id-type="doi">10.1016/j.sapharm.2012.05.009</pub-id><pub-id pub-id-type="medline">22835706</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gwet</surname><given-names>KL</given-names> </name></person-group><article-title>Computing inter-rater reliability and its variance in the presence of high agreement</article-title><source>Br J Math Stat Psychol</source><year>2008</year><month>05</month><volume>61</volume><issue>Pt 1</issue><fpage>29</fpage><lpage>48</lpage><pub-id pub-id-type="doi">10.1348/000711006X126600</pub-id><pub-id pub-id-type="medline">18482474</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Hanna</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bojar</surname><given-names>O</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Barrault</surname><given-names>L</given-names> </name><name name-style="western"><surname>Bojar</surname><given-names>O</given-names> </name><name 
name-style="western"><surname>Bougares</surname><given-names>F</given-names> </name><name name-style="western"><surname>Chatterjee</surname><given-names>R</given-names> </name><name name-style="western"><surname>Costa-jussa</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Federmann</surname><given-names>C</given-names> </name></person-group><article-title>A fine-grained analysis of BERTScore</article-title><access-date>2026-01-29</access-date><conf-name>Proceedings of the Sixth Conference on Machine Translation</conf-name><conf-date>Nov 10-11, 2021</conf-date><fpage>507</fpage><lpage>517</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2021.wmt-1.59/">https://aclanthology.org/2021.wmt-1.59/</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Verhoeven</surname><given-names>F</given-names> </name><name name-style="western"><surname>Steehouder</surname><given-names>MF</given-names> </name><name name-style="western"><surname>Hendrix</surname><given-names>RMG</given-names> </name><name name-style="western"><surname>Van Gemert-Pijnen</surname><given-names>JEWC</given-names> </name></person-group><article-title>From expert-driven to user-oriented communication of infection control guidelines</article-title><source>Int J Hum Comput Stud</source><year>2010</year><month>06</month><volume>68</volume><issue>6</issue><fpage>328</fpage><lpage>343</lpage><pub-id pub-id-type="doi">10.1016/j.ijhcs.2009.07.003</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tayyar Madabushi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Jones</surname><given-names>MD</given-names> </name></person-group><article-title>Large language models in 
healthcare information research: making progress in an emerging field</article-title><source>BMJ Qual Saf</source><year>2025</year><month>01</month><day>28</day><volume>34</volume><issue>2</issue><fpage>73</fpage><lpage>76</lpage><pub-id pub-id-type="doi">10.1136/bmjqs-2024-017896</pub-id><pub-id pub-id-type="medline">39443104</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Jones</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Torgbi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tayyar Madabushi</surname><given-names>H</given-names> </name></person-group><article-title>Large language models to improve the understandability of clinical guidelines: an evaluation of readability improvements and unintended content changes produced by GPT-4</article-title><source>GitHub</source><year>2026</year><access-date>2026-02-19</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/melissatorgbi/LLM-Clinical-Guideline-Understandability">https://github.com/melissatorgbi/LLM-Clinical-Guideline-Understandability</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Large language model prompts and additional data.</p><media xlink:href="jmir_v28i1e81915_app1.pdf" xlink:title="PDF File, 134 KB"/></supplementary-material></app-group></back></article>