<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e78393</article-id><article-id pub-id-type="doi">10.2196/78393</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Performance of Retrieval-Augmented Generation Large Language Models in Guideline-Concordant Prostate-Specific Antigen Testing: Comparative Study With Junior Clinicians</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Tung</surname><given-names>Joshua Yi Min</given-names></name><degrees>MBBS, MPH</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Le</surname><given-names>Quan</given-names></name><degrees>BEng, MITB</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yao</surname><given-names>Jinxuan</given-names></name><degrees>BEng, MTech</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Huang</surname><given-names>Yifei</given-names></name><degrees>BCS, MTech</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lim</surname><given-names>Daniel Yan Zheng</given-names></name><degrees>MBBS, MTech</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sng</surname><given-names>Gerald Gui Ren</given-names></name><degrees>MBBS, MPH, MMed</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lau</surname><given-names>Rachel Shu En</given-names></name><degrees>MBBS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tan</surname><given-names>Yu Guang</given-names></name><degrees>MBBS, MCI</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Kenneth</given-names></name><degrees>MBBS, MCI</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tay</surname><given-names>Kae Jack</given-names></name><degrees>MBBS, MMed, MCI</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tan</surname><given-names>Jen Hong</given-names></name><degrees>BEng, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yuen</surname><given-names>John Shyi Peng</given-names></name><degrees>MBBS, MMed, DPhil</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cheng</surname><given-names>Christopher Wai 
Sam</given-names></name><degrees>MBBS, MMed</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ho</surname><given-names>Henry Sun Sien</given-names></name><degrees>MBBS, MMed</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>Data Science and Artificial Intelligence Laboratory, Singapore General Hospital</institution><addr-line>Singapore</addr-line><country>Singapore</country></aff><aff id="aff2"><institution>Department of Urology, Singapore General Hospital</institution><addr-line>Block 4 Level 1, 16 College Road</addr-line><addr-line>Singapore</addr-line><country>Singapore</country></aff><aff id="aff3"><institution>Department of Gastroenterology, Singapore General Hospital</institution><addr-line>Singapore</addr-line><country>Singapore</country></aff><aff id="aff4"><institution>Department of Endocrinology, Singapore General Hospital</institution><addr-line>Singapore</addr-line><country>Singapore</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Beltr&#x00E3;o</surname><given-names>Monique</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Modebelu</surname><given-names>Ukamaka</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Yijun</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Joshua Yi Min Tung, MBBS, MPH, Department of Urology, Singapore General Hospital, Block 4 Level 1, 16 College Road, Singapore, 169854, Singapore, 65 62223322; <email>joshua.tung@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date 
pub-type="epub"><day>19</day><month>11</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e78393</elocation-id><history><date date-type="received"><day>02</day><month>06</month><year>2025</year></date><date date-type="rev-recd"><day>12</day><month>10</month><year>2025</year></date><date date-type="accepted"><day>12</day><month>10</month><year>2025</year></date></history><copyright-statement>&#x00A9; Joshua Yi Min Tung, Quan Le, Jinxuan Yao, Yifei Huang, Daniel Yan Zheng Lim, Gerald Gui Ren Sng, Rachel Shu En Lau, Yu Guang Tan, Kenneth Chen, Kae Jack Tay, Jen Hong Tan, John Shyi Peng Yuen, Christopher Wai Sam Cheng, Henry Sun Sien Ho. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 19.11.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e78393"/><abstract><sec><title>Background</title><p>Prostate-specific antigen (PSA) testing remains the cornerstone of early prostate cancer detection. 
Society guidelines for prostate cancer screening via PSA testing serve to standardize patient care and are often used by trainees, junior staff, or generalist medical practitioners to guide medical decision-making. However, adherence to guidelines is a time-consuming and challenging task, and rates of inappropriate PSA testing are high. Retrieval-augmented generation (RAG) is a method to enhance the reliability of large language models (LLMs) by grounding responses in trusted external sources.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate a RAG-enhanced LLM system, grounded in current European Association of Urology and American Urological Association guidelines, to assess its effectiveness in providing guideline-concordant PSA screening recommendations compared to junior clinicians.</p></sec><sec sec-type="methods"><title>Methods</title><p>A series of 44 fictional outpatient case scenarios was developed to represent a broad spectrum of clinical presentations. A RAG pipeline was developed, comprising a life expectancy estimation module based on the Charlson Comorbidity Index, followed by LLM-generated recommendations constrained to retrieved excerpts from the European Association of Urology and American Urological Association guidelines. Five junior clinicians were tasked to provide PSA testing recommendations for the same scenarios in closed-book and open-book formats. Answers were compared for accuracy in a binomial fashion. Fleiss &#x03BA; was computed to assess interrater agreement among clinicians.</p></sec><sec sec-type="results"><title>Results</title><p>The RAG-LLM tool provided guideline-concordant recommendations in 95.5% (210/220) of case scenarios, compared to junior clinicians, who were correct in 62.3% (137/220) of scenarios in a closed-book format and 74.1% (163/220) of scenarios in an open-book format. 
The difference was statistically significant for both closed-book (<italic>P</italic>&#x003C;.001) and open-book (<italic>P</italic>&#x003C;.001) formats. Interrater agreement among clinicians was fair, with Fleiss &#x03BA; of 0.294 and 0.321 for closed-book and open-book formats, respectively.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Use of RAG techniques allows LLMs to integrate complex guidelines into day-to-day medical decision-making. RAG-LLM tools in urology have the capability to enhance clinical decision-making by providing guideline-concordant recommendations for PSA testing, potentially improving the consistency of health care delivery, reducing cognitive load on clinicians, and reducing unnecessary investigations and costs. While this study used synthetic cases in a controlled simulation environment, it establishes a foundation for future validation in real-world clinical settings.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>large language model</kwd><kwd>LLM</kwd><kwd>guideline concordance</kwd><kwd>junior clinician</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Prostate cancer is the second most commonly diagnosed cancer and the fifth leading cause of cancer-related death among men globally [<xref ref-type="bibr" rid="ref1">1</xref>]. Screening for prostate cancer is thus a common issue in both primary and specialist care settings. 
Prostate-specific antigen (PSA) testing is the most widely used method for early detection, but remains a controversial issue in urological literature, largely owing to the harms associated with overdiagnosis and overtreatment [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>Society guidelines for prostate cancer screening via PSA testing serve to streamline and standardize patient care and are often used by trainees, junior staff, or nonspecialist medical practitioners to guide medical decision-making. Such guidelines have been issued by various organizations such as the European Association of Urology (EAU) [<xref ref-type="bibr" rid="ref4">4</xref>] and American Urological Association (AUA) [<xref ref-type="bibr" rid="ref5">5</xref>], but discrepancies between these guidelines, such as recommendations on whether PSA screening should be offered, the appropriate patient populations, and screening intervals, pose challenges for clinical decision-making. These are further complicated by the need to consider other patient factors, such as the need to calculate estimated life expectancy (as many guidelines do not recommend PSA screening in patients with a &#x003C;10- or &#x003C;15-year life expectancy), and the need to consider the patient&#x2019;s own preferences. Shared decision-making forms a key component in both the EAU and AUA guidelines, particularly in older men or those with multiple medical comorbidities.</p><p>The current EAU-European Association of Nuclear Medicine-European Society for Radiotherapy and Oncology-European Society of Urogenital Radiology-International Society of Urological Pathology-International Society of Geriatric Oncology and AUA and Society of Urologic Oncology guidelines on prostate cancer and early detection of prostate cancer stand at 239 and 47 pages, respectively. 
Appropriate decision-making and adherence to guidelines is therefore a time-consuming and challenging task for nonspecialists in a primary care setting, as well as for specialists in outpatient settings where time constraints are common. Prior studies have shown a low rate of compliance to organizational guidelines, such as a cohort study of 32,306 men showing that 40% of those aged &#x003E;80 years received inappropriate PSA screening [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>One potential solution to this problem is to use artificial intelligence (AI) to parse guidelines and deliver an appropriate recommendation. Large language models (LLMs) are a form of AI that are trained on large amounts of text data and hence have the capability to process unstructured text inputs and generate appropriate responses. They can thus be applied in health care, such as in patient communications, education, and clinical risk stratification [<xref ref-type="bibr" rid="ref7">7</xref>]. However, general LLMs, such as the GPT models developed by OpenAI, are not specifically designed for health care use and can produce inaccurate or misleading information. They have a knowledge cutoff based on the recency of the underlying training data, for example, January 2022 for the OpenAI GPT-4 models. To address these limitations, retrieval-augmented generation (RAG) techniques have been developed to enhance the accuracy of LLMs. RAG directs the LLM to answer a given scenario by referencing an additional database of curated information, such as a set of guidelines. 
By grounding the responses using relevant information from the database, LLMs can overcome their intrinsic knowledge cutoff and produce responses with less hallucination [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>Thus, the aim of this study was to evaluate the accuracy of a RAG-enabled LLM that had been grounded in the EAU and AUA guidelines pertaining to prostate cancer screening.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This study was conducted in a simulated environment using only fictional patient data. As the use of fictional data does not fall under local Human Biomedical Research Act regulations, ethics approval was not required.</p></sec><sec id="s2-2"><title>Development of Case Scenarios</title><p>A series of 44 fictional case scenarios was developed to reflect a range of clinical presentations in an outpatient clinic setting. These free-text scenarios included fictional patient biodata such as age, medical comorbidities, presence or absence of urological symptoms (eg, hematuria or lower urinary tract symptoms, if any), and prior PSA readings (if applicable). These were written by a urology fellow with 8 years of clinical experience and supervised by 2 urology consultants with &#x003E;20 years of clinical experience each.</p></sec><sec id="s2-3"><title>Development of the RAG-Enabled LLM</title><p>We developed an automated pipeline to process case scenarios based on how a health care provider would provide a PSA testing recommendation. The schematic diagram is shown in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Workflow schematic of the retrieval-augmented generation&#x2013;enabled large language model pipeline for prostate-specific antigen (PSA) testing recommendations. 
AUA: American Urological Association; CCI: Charlson Comorbidity Index; CHF: congestive heart failure; EAU: European Association of Urology.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e78393_fig01.png"/></fig><p>Key components of this pipeline included an LLM-based calculator to extract relevant patient information (age and comorbidities) from the case scenario, to calculate the Charlson Comorbidity Index (CCI) and thereby estimate the expected 10-year life expectancy. Patients who were not expected to live at least 10 years were not recommended for PSA screening [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>], and the pipeline did not allow such case scenarios to proceed. Likewise, scenarios where the patient was aged &#x003E;72 years were also not permitted to proceed. We provide further technical details of the CCI calculator in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>For patients with at least a 10-year life expectancy based on CCI scores, a RAG-enabled LLM was used to provide a recommendation based on the given case scenario. In comparison with standard &#x201C;off-the-shelf&#x201D; LLMs that are not trained on domain-specific medical information, RAG allows the LLM to reference a fixed set of material, such as the relevant EAU and AUA society guidelines in this study. 
Language models augmented in this way with contextualized information can overcome their intrinsic knowledge deficits and reduce hallucination by constraining their responses to the provided information.</p><p>Because the AUA and EAU guidelines occasionally provide different and nonoverlapping recommendations, separate answers were first generated from each set of guidelines and then combined to produce the final recommendations.</p><p>We provide further technical details of the RAG-enabled LLM in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. These include explanations of modern RAG techniques applied to optimize performance, such as context filtering to improve retrieval of relevant information and advanced prompting methods (chain-of-thought reasoning [<xref ref-type="bibr" rid="ref13">13</xref>], constraining responses to retrieved information, providing example output structures, and using an expert clinician persona). The full RAG prompt can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>].</p></sec><sec id="s2-4"><title>Relevant Software</title><p>The RAG prototype was developed with Python (version 3.10; Python Software Foundation). Vector databases were constructed using Unstructured API for ingestion of PDF documents, OpenAI API for generation of text embeddings, and Qdrant as the vector database. For LLM calls, we used both OpenAI and Anthropic APIs for different components in our pipeline. 
We used both LlamaIndex and LangChain for orchestration, with LlamaIndex handling the retrieval-augmented generation components, whereas LangChain was used for structured data extraction and connecting pipeline components.</p></sec><sec id="s2-5"><title>Answer Generation and Grading</title><p>Five junior clinicians were tasked to provide recommendations on PSA testing for each of the case scenarios. They included a first-year medical officer, a second-year family medicine resident, 2 second-year urology residents, and a third-year urology resident. Each clinician completed the task in a &#x201C;closed-book&#x201D; format, followed by an &#x201C;open-book&#x201D; format in which they were permitted to reference relevant material of their choice (eg, guidelines or textbooks). The time taken to complete the task in each format was recorded.</p><p>The RAG-LLM tool was likewise provided with the same set of fictional case scenarios and instructed to provide recommendations on PSA testing. We conducted 5 runs to assess the consistency of the LLM output. Answers were graded by the study team in a binomial format (correct or incorrect). Answers were marked as correct if they were concordant with either the EAU or AUA guidelines.</p></sec><sec id="s2-6"><title>Statistical Analysis</title><p>SPSS (version 26.0; IBM Corp) was used for the statistical analysis of quantitative data. Answers from the RAG-LLM tool and human comparators were compared using Student 2-tailed <italic>t</italic> test. Interrater agreement was calculated using Fleiss &#x03BA;.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>The RAG-LLM tool provided guideline-concordant recommendations in 95.5% (210/220) of case scenarios, compared to junior clinicians, who were correct in 62.3% (137/220) of scenarios in a closed-book format and 74.1% (163/220) of scenarios in an open-book format. 
The difference was statistically significant for both closed-book (<italic>P</italic>&#x003C;.001) and open-book (<italic>P</italic>&#x003C;.001) formats.</p><p>Cases were divided into screening (20/44, 45.5%) and follow-up (24/44, 54.5%) categories. The RAG-LLM tool provided an incorrect recommendation in 1 screening case: in all 5 instances, it failed to recommend a PSA test for a patient for whom screening was recommended. In comparison, junior clinicians missed 16/100 (16%) tests in the closed-book format and 11/100 (11%) in the open-book format. They also offered 14/100 (14%) unnecessary PSA tests in the closed-book format and 10/100 (10%) in the open-book format. For follow-up cases, the RAG-LLM tool provided an incorrect recommendation in 1 case: in all 5 instances, it incorrectly recommended a repeat PSA test for a patient with a normal PSA reading. In comparison, junior clinicians ordered 29/120 (24.2%) unnecessary tests in the closed-book format and 23/120 (19.2%) in the open-book format, and missed 24/120 (20%) tests and 13/120 (10.8%) tests in the closed-book and open-book formats, respectively. Overall, the RAG-LLM tool recommended 71 (5 vs 76, 93.4%) fewer unnecessary PSA tests than junior clinicians and missed 59 (5 vs 64, 92.2%) fewer PSA tests that should have been offered.</p><p>Results were further analyzed by the following categories of cases: (1) PSA screening recommended; (2) PSA screening not recommended; (3) follow-up of a normal PSA reading; (4) management or follow-up of an elevated PSA reading; and (5) others, including likely spuriously elevated PSA readings from concurrent urinary tract infections, elevated PSA readings in patients with significant comorbidity in whom further or repeat testing would be unlikely to be beneficial, and normal PSA readings in patients with an abnormal digital rectal examination. 
Results are detailed in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Accuracy and error breakdown of prostate-specific antigen (PSA) testing recommendations by retrieval-augmented generation&#x2013;large language model (RAG-LLM) and junior clinicians.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Group and category<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="bottom" colspan="3">Unnecessary tests, n (%)</td><td align="left" valign="bottom" colspan="3">Missed tests, n (%)</td><td align="left" valign="bottom">Total errors, n (%)</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Short interval</td><td align="left" valign="bottom">Did not require</td><td align="left" valign="bottom">Subtotal</td><td align="left" valign="bottom">Long interval</td><td align="left" valign="bottom">Failed to offer</td><td align="left" valign="bottom">Subtotal</td><td align="left" valign="bottom"/><td align="left" valign="bottom"/></tr></thead><tbody><tr><td align="left" valign="top" colspan="9">Overall (n=220)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLM</td><td align="left" valign="top">5 (2.3)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">5 (2.3)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">5 (2.3)</td><td align="left" valign="top">5 (2.3)</td><td align="left" valign="top">10 (4.5)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Human, closed-book</td><td align="left" valign="top">11 
(5.0)</td><td align="left" valign="top">32 (14.5)</td><td align="left" valign="top">43 (19.5)</td><td align="left" valign="top">26 (11.8)</td><td align="left" valign="top">14 (6.4)</td><td align="left" valign="top">40 (18.2)</td><td align="left" valign="top">83 (37.7)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Human, open-book</td><td align="left" valign="top">10 (4.5)</td><td align="left" valign="top">23 (10.5)</td><td align="left" valign="top">33 (15.0)</td><td align="left" valign="top">14 (6.4)</td><td align="left" valign="top">10 (4.5)</td><td align="left" valign="top">24 (10.9)</td><td align="left" valign="top">57 (25.9)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="9">Category 1: PSA screening recommended (n=55)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLM</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">5 (9.1)</td><td align="left" valign="top">5 (9.1)</td><td align="left" valign="top">5 (9.1)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Human, closed-book</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">3 (5.5)</td><td align="left" valign="top">10 (18.2)</td><td align="left" valign="top">13 (23.6)</td><td align="left" valign="top">13 (23.6)</td><td align="left" valign="top">.04</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Human, 
open-book</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (1.8)</td><td align="left" valign="top">10 (18.2)</td><td align="left" valign="top">11 (20)</td><td align="left" valign="top">11 (20)</td><td align="left" valign="top">.11</td></tr><tr><td align="left" valign="top" colspan="9">Category 2: PSA screening not recommended (n=45)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLM</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Human, closed-book</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">14 (31.1)</td><td align="left" valign="top">14 (31.1)</td><td align="left" valign="top">3 (6.7)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">3 (6.7)</td><td align="left" valign="top">17 (37.8)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Human, open-book</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">10 (22.2)</td><td align="left" valign="top">10 (22.2)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">10 (22.2)</td><td align="left" valign="top">.001</td></tr><tr><td align="left" valign="top" colspan="9">Category 3: normal PSA follow-up (n=45)</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLM</td><td align="left" valign="top">5 (11.1)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">5 (11.1)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">5 (11.1)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Human, closed-book</td><td align="left" valign="top">8 (17.8)</td><td align="left" valign="top">8 (17.8)</td><td align="left" valign="top">16 (35.6)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">3 (6.7)</td><td align="left" valign="top">3 (6.7)</td><td align="left" valign="top">19 (42.2)</td><td align="left" valign="top">.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Human, open-book</td><td align="left" valign="top">7 (15.6)</td><td align="left" valign="top">7 (15.6)</td><td align="left" valign="top">14 (31.1)</td><td align="left" valign="top">1 (2.2)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (2.2)</td><td align="left" valign="top">15 (33.3)</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top" colspan="9">Category 4: elevated PSA (n=40)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLM</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Human, closed-book</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">20 (50)</td><td align="left" valign="top">1 (2.5)</td><td align="left" valign="top">21 (52.5)</td><td align="left" valign="top">21 (52.5)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Human, open-book</td><td align="left" valign="top">1 (2.5)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">1 (2.5)</td><td align="left" valign="top">12 (30)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">12 (30)</td><td align="left" valign="top">13 (32.5)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="9">Category 5: others (n=35)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>LLM</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Human, closed-book</td><td align="left" valign="top">3 (8.6)</td><td align="left" valign="top">10 (28.6)</td><td align="left" valign="top">13 (37.1)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">13 (37.1)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Human, open-book</td><td align="left" valign="top">2 (5.7)</td><td align="left" valign="top">6 (17.1)</td><td align="left" valign="top">8 (22.9)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">8 (22.9)</td><td align="left" valign="top">.002</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>The denominators used for all percentage calculations represent the number of cases in each category multiplied by 5, as each of the 44 case scenarios was independently evaluated by 5 junior clinicians. Accordingly, the overall total is shown as n=220, and the denominators for each category (eg, n=55 for category 1, n=45 for category 2, etc) follow the same calculation method.</p></fn><fn id="table1fn2"><p><sup>b</sup>Not available.</p></fn></table-wrap-foot></table-wrap><p>Average time taken by clinicians to provide a recommendation was 23 seconds in the closed-book format and 28 seconds in the open-book format. In comparison, the RAG-LLM tool averaged 9.7 seconds per recommendation. Interrater agreements among clinicians for closed-book and open-book responses were Fleiss &#x03BA;=0.294 (95% CI 0.291&#x2010;0.297; <italic>P</italic>&#x003C;.001) and Fleiss &#x03BA;=0.321 (95% CI 0.318&#x2010;0.324; <italic>P</italic>&#x003C;.001), respectively, indicating fair agreement. In comparison, Fleiss &#x03BA; for RAG-LLM tool responses was 1.000 (95% CI 0.998&#x2010;1.000; <italic>P</italic>&#x003C;.001), indicating very good agreement.</p></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>To our knowledge, this is the first study in the field of urology demonstrating the efficacy of a RAG-LLM tool for clinical decision support. 
Augmenting LLMs with contextualized information has been shown in other health care domains to reduce instances of hallucination and increase accuracy [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>]. In this study, guideline-concordant recommendations were made in &#x003E;95% of scenarios by the RAG-LLM, as compared to the 60%-75% concordance by junior clinicians.</p><p>Examining responses that were not guideline concordant, we found that the errors made by the RAG-LLM arose from (1) the rule-based nature of the CCI calculator, which precluded a patient aged 72 years from PSA screening despite strong risk factors for prostate cancer and (2) erroneous interpretation of a normal PSA result as &#x201C;moderately elevated,&#x201D; triggering a reactive repeat PSA test, which in actuality was unnecessary. In contrast, the junior clinicians made errors across a broad range of categories, irrespective of seniority or training status.</p><p>Analysis of the incorrect recommendation given by the RAG-LLM was undertaken by examining the retrieved guideline chunks and the LLM output for each guideline, followed by the final recommendation. The scenario was that of a 55-year-old man who had been on follow-up for erectile dysfunction, with a PSA screening result of 2.8 ng/mL. The retrieved chunks for both AUA and EAU guidelines contained the information required to answer the clinical scenario.</p><p>With regard to the AUA guidelines, the RAG-LLM chain-of-thought process correctly identified an appropriate interval of &#x201C;regular PSA screening every 2 to 4 years for people aged 50 to 69 years,&#x201D; but wrongly reasoned that a PSA level of 2.8 ng/mL was elevated and thus recommended a repeat PSA test. As no text in the retrieved chunks suggested the classification of a PSA of 2.8 ng/mL as elevated, we classified this error as a hallucination. 
Conversely, for the EAU guidelines, contained within the same chunk were the phrases &#x201C;the most commonly applied threshold for PSA is &#x2265; 3.0 ng/ml&#x201D; and &#x201C;In case of a moderately elevated PSA (up to 10 ng/mL), a repeated test after a few weeks should be considered to confirm the increase.&#x201D; The RAG-LLM failed to synthesize these 2 pieces of information&#x2014;specifically, that a &#x201C;moderately elevated&#x201D; PSA would range between 3 and 10 ng/mL&#x2014;and interpreted the PSA of 2.8 ng/mL as moderately elevated. While it recognized the threshold by giving an output stating &#x201C;given the patient&#x2019;s age (55 years) and PSA level (2.8 ng/mL), he falls into a category where follow-up intervals of two years may be considered,&#x201D; it proceeded to reason that &#x201C;the reference context also suggests that in cases of moderately elevated PSA, a repeated test after a few weeks should be considered,&#x201D; thus recommending an unnecessary confirmatory repeat PSA test.</p><p>In case scenarios where EAU and AUA guidelines provided differing recommendations for PSA testing intervals, the RAG-LLM tool provided both recommendations. In comparison, junior clinicians generally selected a single guideline document as a reference. While not incorrect, their responses were thus qualitatively less comprehensive and thorough than those generated by the LLM tool.</p><p>Our study demonstrates that RAG-LLM tools have the potential to augment clinical decision-making by providing guideline-concordant recommendations in real time. While such a clinical task may be relatively simple for an experienced specialist, generalists or junior clinicians may not necessarily have similar familiarity and experience with specialist care. 
Such clinical decision support tools may prove useful in primary care settings or in care settings where it is practically challenging for a senior clinician to supervise every clinical decision due to time constraints and high patient volume. Patient-specific, guideline-based tools can potentially relieve cognitive burden, shorten learning curves, and improve decision-making time, thus improving overall consistency and efficiency of clinic consultations [<xref ref-type="bibr" rid="ref16">16</xref>]. Use of RAG-LLM tools as a method to improve guideline adherence can also be a strategy to minimize unnecessary investigations and specialist consultation, thereby reducing costs to patients and public health care systems. In the primary care setting, increased adherence to guidelines has been shown to improve the quality and appropriateness of specialist referrals [<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>From a technical standpoint, RAG-LLM tools are preferable to &#x201C;off-the-shelf&#x201D; LLMs. The use of LLMs in clinical medicine engenders concerns of hallucination and resulting inaccurate recommendations, with implications for patient care and safety. Incorporating RAG systems in LLM tools reduces the frequency of hallucinations [<xref ref-type="bibr" rid="ref18">18</xref>] and is more economical than fine-tuning or pretraining a model from the ground up.</p></sec><sec id="s4-2"><title>Limitations</title><p>We acknowledge some important limitations to this study, which fall into the clinical and technical domains. First, from a clinical perspective, this study used fictional case scenarios, rather than real clinical cases. While this may limit generalizability and external validation, it is arguably better to perform LLM evaluation on a well-curated set of varied case scenarios, rather than a sample from a general population that would be less likely to feature uncommon or complex cases [<xref ref-type="bibr" rid="ref19">19</xref>]. 
This is analogous to the assessment of junior clinicians, where ability would be assessed using a purposefully designed set of cases, rather than a general sample of common cases [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Future directions include testing model robustness against retrospective and prospective real-world clinical cases.</p><p>A second clinical limitation is the use of the CCI as a tool to estimate 10-year life expectancy. Although the CCI is recommended in the EAU guidelines as a means of estimating life expectancy, it was created in 1987 and has certain limitations in modern practice, such as an incomplete list of comorbidities, assumptions that the effect of comorbidities is additive, and potentially lengthier disease prognoses with modern medical management [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. While comorbidity burden and a patient&#x2019;s remaining healthy lifespan are key determinants of benefit from any form of screening test, current scoring tools may not adequately capture the nuances of clinical practice and patient assessment and indeed rely on cohort measures of central tendency to estimate life expectancy. We thus envision that such clinical decision support tools would assist clinicians as copilots, maintaining a human-in-the-loop approach rather than functioning as autonomous decision-makers. Additionally, the tool design is modular and separates CCI determination and case analysis into sequential steps, allowing substitution of an alternative comorbidity calculator or omission of this step altogether at the clinician&#x2019;s discretion.</p><p>Third, from a technical perspective, although supplementing LLMs with RAG has been shown to reduce rates of AI hallucinations [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref24">24</xref>], these models are not entirely immune to hallucination. 
Our RAG-LLM tool provided incorrect recommendations in 1 scenario due to hallucination or faulty reasoning, but erred in a conservative direction, avoiding harms arising from a missed prostate cancer diagnosis. The source of error suggests that current textual documents may require a degree of unwritten human inference, which is not an intrinsic ability that LLMs possess. Identification of these problematic areas in text data and explicit definition of terms may improve reasoning and performance of LLM-based tools. The &#x201C;black box&#x201D; nature of many AI or AI-assisted tools [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>] may present difficulties in pinpointing errors in internal reasoning processes, but use of techniques such as prompt engineering and self-reflective RAG models may help to enhance the accuracy of these models [<xref ref-type="bibr" rid="ref27">27</xref>]. Variability in performance across different LLMs also needs to be taken into account and balanced against the cost of each model.</p></sec><sec id="s4-3"><title>Future Directions</title><p>Despite these limitations, RAG-LLM tools retain potential for multiple applications in health care. On the basis of the same system for clinical decision support for guideline-based recommendations, it can also be used retrospectively as an auditing tool to identify areas of guideline discordance in clinical practice [<xref ref-type="bibr" rid="ref28">28</xref>]. Furthermore, the RAG approach allows future guideline documents to be incorporated much more easily than a fine-tuning or pretraining approach, keeping the tool up-to-date and preventing obsolescence [<xref ref-type="bibr" rid="ref29">29</xref>]. 
Prospective real-world model validation based on clinical data, multimodel evaluation, implementation of explainability methods, and expansion of such RAG-LLM pipelines beyond PSA testing to other areas in urology are potential areas for further research.</p></sec><sec id="s4-4"><title>Conclusions</title><p>In this simulation-based comparative evaluation, we developed a RAG-LLM tool to provide clinical decision support on PSA testing. The tool demonstrated high accuracy, outperforming junior clinicians in making efficient and guideline-concordant decisions. The use of such tools can help increase guideline adherence, improve patient care, and optimize the use of health care resources.</p></sec></sec></body><back><notes><sec><title>Funding</title><p>This study was supported by an academic medicine philanthropic fund (the Foo Keong Tatt Professorship in Urology) from the Singapore Health Services Duke-National University of Singapore (&#x201C;SingHealth Duke-NUS&#x201D;) Joint Office of Academic Medicine.</p></sec><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: JYMT, DYZL, GGRS</p><p>Data curation: JYMT, RSEL, YGT, KC, KJT</p><p>Formal analysis: QL, JY, YH, JHT</p><p>Funding acquisition: JSPY, CWSC</p><p>Methodology: JYMT, DYZL, GGRS, CWSC</p><p>Software: QL, JY, YH, JHT</p><p>Supervision: JHT, JSPY, CWSC, HSSH</p><p>Validation: KC, KJT</p><p>Visualization: QL</p><p>Writing &#x2013; original draft: JYMT, RSEL</p><p>Writing &#x2013; review and editing: JYMT, DYZL, GGRS, RSEL, KJT, JSPY, CWSC</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">AUA</term><def><p>American Urological 
Association</p></def></def-item><def-item><term id="abb3">CCI</term><def><p>Charlson Comorbidity Index</p></def></def-item><def-item><term id="abb4">EAU</term><def><p>European Association of Urology</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">PSA</term><def><p>prostate-specific antigen</p></def></def-item><def-item><term id="abb7">RAG</term><def><p>retrieval-augmented generation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bray</surname><given-names>F</given-names> </name><name name-style="western"><surname>Laversanne</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sung</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Global cancer statistics 2022: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title><source>CA A Cancer J Clinicians</source><year>2024</year><month>05</month><volume>74</volume><issue>3</issue><fpage>229</fpage><lpage>263</lpage><pub-id pub-id-type="doi">10.3322/caac.21834</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Etzioni</surname><given-names>R</given-names> </name><name name-style="western"><surname>Penson</surname><given-names>DF</given-names> </name><name name-style="western"><surname>Legler</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>Overdiagnosis due to prostate-specific antigen screening: lessons from U.S. 
prostate cancer incidence trends</article-title><source>J Natl Cancer Inst</source><year>2002</year><month>07</month><day>3</day><volume>94</volume><issue>13</issue><fpage>981</fpage><lpage>990</lpage><pub-id pub-id-type="doi">10.1093/jnci/94.13.981</pub-id><pub-id pub-id-type="medline">12096083</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pinsky</surname><given-names>PF</given-names> </name><name name-style="western"><surname>Parnes</surname><given-names>HL</given-names> </name><name name-style="western"><surname>Andriole</surname><given-names>G</given-names> </name></person-group><article-title>Mortality and complications after prostate biopsy in the Prostate, Lung, Colorectal and Ovarian cancer screening (PLCO) trial</article-title><source>BJU Int</source><year>2014</year><month>02</month><volume>113</volume><issue>2</issue><fpage>254</fpage><lpage>259</lpage><pub-id pub-id-type="doi">10.1111/bju.12368</pub-id><pub-id pub-id-type="medline">24053621</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cornford</surname><given-names>P</given-names> </name><name name-style="western"><surname>van den Bergh</surname><given-names>RC</given-names> </name><name name-style="western"><surname>Briers</surname><given-names>E</given-names> </name><etal/></person-group><article-title>EAU-EANM-ESTRO-ESUR-ISUP-SIOG guidelines on prostate cancer-2024 update. 
Part I: screening, diagnosis, and local treatment with curative intent</article-title><source>Eur Urol</source><year>2024</year><month>08</month><volume>86</volume><issue>2</issue><fpage>148</fpage><lpage>163</lpage><pub-id pub-id-type="doi">10.1016/j.eururo.2024.03.027</pub-id><pub-id pub-id-type="medline">38614820</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Barocas</surname><given-names>D</given-names> </name><name name-style="western"><surname>Carlsson</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Early detection of prostate cancer: AUA/SUO guideline part I: prostate cancer screening</article-title><source>J Urol</source><year>2023</year><month>07</month><volume>210</volume><issue>1</issue><fpage>46</fpage><lpage>53</lpage><pub-id pub-id-type="doi">10.1097/JU.0000000000003491</pub-id><pub-id pub-id-type="medline">37096582</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kalavacherla</surname><given-names>S</given-names> </name><name name-style="western"><surname>Riviere</surname><given-names>P</given-names> </name><name name-style="western"><surname>Javier-DesLoges</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Low-value prostate-specific antigen screening in older males</article-title><source>JAMA Netw Open</source><year>2023</year><month>04</month><day>3</day><volume>6</volume><issue>4</issue><fpage>e237504</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.7504</pub-id><pub-id pub-id-type="medline">37040113</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Clusmann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kolbinger</surname><given-names>FR</given-names> </name><name name-style="western"><surname>Muti</surname><given-names>HS</given-names> </name><etal/></person-group><article-title>The future landscape of large language models in medicine</article-title><source>Commun Med</source><year>2023</year><month>10</month><day>10</day><volume>3</volume><issue>1</issue><fpage>1</fpage><lpage>8</lpage><pub-id pub-id-type="doi">10.1038/s43856-023-00370-1</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ye</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Enhancement of the performance of large language models in diabetes education through retrieval-augmented generation: comparative study</article-title><source>J Med Internet Res</source><year>2024</year><month>11</month><day>8</day><volume>26</volume><issue>1</issue><fpage>e58041</fpage><pub-id pub-id-type="doi">10.2196/58041</pub-id><pub-id pub-id-type="medline">39046096</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>McCoy</surname><given-names>AB</given-names> </name><name name-style="western"><surname>Wright</surname><given-names>A</given-names> </name></person-group><article-title>Improving large language model applications in biomedicine with retrieval-augmented generation: a systematic review, meta-analysis, and clinical development 
guidelines</article-title><source>J Am Med Inform Assoc</source><year>2025</year><month>04</month><day>1</day><volume>32</volume><issue>4</issue><fpage>605</fpage><lpage>615</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaf008</pub-id><pub-id pub-id-type="medline">39812777</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Jia</surname><given-names>W</given-names> </name><name name-style="western"><surname>Piccardi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>P</given-names> </name></person-group><article-title>Empowering large language models for automated clinical assessment with generation-augmented retrieval and hierarchical chain-of-thought</article-title><source>Artif Intell Med</source><year>2025</year><month>04</month><volume>162</volume><issue>103078</issue><fpage>103078</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2025.103078</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Charlson</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Pompei</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ales</surname><given-names>KL</given-names> </name><name name-style="western"><surname>MacKenzie</surname><given-names>CR</given-names> </name></person-group><article-title>A new method of classifying prognostic comorbidity in longitudinal studies: development and validation</article-title><source>J Chronic Dis</source><year>1987</year><volume>40</volume><issue>5</issue><fpage>373</fpage><lpage>383</lpage><pub-id pub-id-type="doi">10.1016/0021-9681(87)90171-8</pub-id><pub-id 
pub-id-type="medline">3558716</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="web"><article-title>Death and life expectancy</article-title><source>Statistics Singapore</source><access-date>2024-09-30</access-date><comment><ext-link ext-link-type="uri" xlink:href="http://www.singstat.gov.sg/find-data/search-by-theme/population/death-and-life-expectancy/latest-data">http://www.singstat.gov.sg/find-data/search-by-theme/population/death-and-life-expectancy/latest-data</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Koyejo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mohamed</surname><given-names>S</given-names> </name><name name-style="western"><surname>Agarwal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Belgrave</surname><given-names>D</given-names> </name></person-group><article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title><source>NIPS &#x2019;22: Proceedings of the 36th International Conference on Neural Information Processing Systems</source><year>2024</year><publisher-name>Curran Associates Inc</publisher-name><fpage>24824</fpage><lpage>24837</lpage><pub-id pub-id-type="other">9781713871088</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lim</surname><given-names>DYZ</given-names> </name><name 
name-style="western"><surname>Tan</surname><given-names>YB</given-names> </name><name name-style="western"><surname>Koh</surname><given-names>JTE</given-names> </name><etal/></person-group><article-title>ChatGPT on guidelines: providing contextual knowledge to GPT allows it to provide advice on appropriate colonoscopy intervals</article-title><source>J Gastroenterol Hepatol</source><year>2024</year><month>01</month><volume>39</volume><issue>1</issue><fpage>81</fpage><lpage>106</lpage><pub-id pub-id-type="doi">10.1111/jgh.16375</pub-id><pub-id pub-id-type="medline">37855067</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ge</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>S</given-names> </name><name name-style="western"><surname>Owens</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Development of a liver disease-specific large language model chat interface using retrieval augmented generation</article-title><source>Gastroenterology</source><year>2023</year><month>11</month><day>10</day><pub-id pub-id-type="doi">10.1101/2023.11.10.23298364</pub-id><pub-id pub-id-type="medline">38451962</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>N</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Harnessing the power of clinical decision support systems: challenges and opportunities</article-title><source>Open Heart</source><year>2023</year><month>11</month><volume>10</volume><issue>2</issue><fpage>e002432</fpage><pub-id 
pub-id-type="doi">10.1136/openhrt-2023-002432</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Blank</surname><given-names>L</given-names> </name><name name-style="western"><surname>Baxter</surname><given-names>S</given-names> </name><name name-style="western"><surname>Woods</surname><given-names>HB</given-names> </name><etal/></person-group><article-title>Referral interventions from primary to specialist care: a systematic review of international evidence</article-title><source>Br J Gen Pract</source><year>2014</year><month>12</month><volume>64</volume><issue>629</issue><fpage>e765</fpage><lpage>e774</lpage><pub-id pub-id-type="doi">10.3399/bjgp14X682837</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilbert</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kather</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Hogan</surname><given-names>A</given-names> </name></person-group><article-title>Augmented non-hallucinating large language models as medical information curators</article-title><source>NPJ Digit Med</source><year>2024</year><month>04</month><day>23</day><volume>7</volume><issue>1</issue><fpage>100</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01081-0</pub-id><pub-id pub-id-type="medline">38654142</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bai</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ye</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name></person-group><article-title>The benefits of using atypical presentations and rare diseases in problem-based learning in undergraduate medical education</article-title><source>BMC Med Educ</source><year>2023</year><month>02</month><day>6</day><volume>23</volume><issue>1</issue><fpage>93</fpage><pub-id pub-id-type="doi">10.1186/s12909-023-04079-6</pub-id><pub-id pub-id-type="medline">36747223</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Surry</surname><given-names>LT</given-names> </name><name name-style="western"><surname>Torre</surname><given-names>D</given-names> </name><name name-style="western"><surname>Durning</surname><given-names>SJ</given-names> </name></person-group><article-title>Exploring examinee behaviours as validity evidence for multiple-choice question examinations</article-title><source>Med Educ</source><year>2017</year><month>10</month><volume>51</volume><issue>10</issue><fpage>1075</fpage><lpage>1085</lpage><pub-id pub-id-type="doi">10.1111/medu.13367</pub-id><pub-id pub-id-type="medline">28758233</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ilgen</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Bowen</surname><given-names>JL</given-names> </name><name name-style="western"><surname>McIntyre</surname><given-names>LA</given-names> </name><etal/></person-group><article-title>Comparing diagnostic performance and the utility of clinical vignette-based assessment under testing conditions designed to encourage 
either automatic or analytic thought</article-title><source>Acad Med</source><year>2013</year><month>10</month><volume>88</volume><issue>10</issue><fpage>1545</fpage><lpage>1551</lpage><pub-id pub-id-type="doi">10.1097/ACM.0b013e3182a31c1e</pub-id><pub-id pub-id-type="medline">23969355</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Drosdowsky</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gough</surname><given-names>K</given-names> </name></person-group><article-title>The Charlson Comorbidity Index: problems with use in epidemiological research</article-title><source>J Clin Epidemiol</source><year>2022</year><month>08</month><volume>148</volume><fpage>174</fpage><lpage>177</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2022.03.022</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Charlson</surname><given-names>ME</given-names> </name><name name-style="western"><surname>Carrozzino</surname><given-names>D</given-names> </name><name name-style="western"><surname>Guidi</surname><given-names>J</given-names> </name><name name-style="western"><surname>Patierno</surname><given-names>C</given-names> </name></person-group><article-title>Charlson comorbidity index: a critical review of clinimetric properties</article-title><source>Psychother Psychosom</source><year>2022</year><volume>91</volume><issue>1</issue><fpage>8</fpage><lpage>35</lpage><pub-id pub-id-type="doi">10.1159/000521288</pub-id><pub-id pub-id-type="medline">34991091</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ji</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>An</surname><given-names>R</given-names> </name></person-group><article-title>Use of retrieval-augmented large language model for COVID-19 fact-checking: development and usability study</article-title><source>J Med Internet Res</source><year>2025</year><volume>27</volume><issue>1</issue><fpage>e66098</fpage><pub-id pub-id-type="doi">10.2196/66098</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>London</surname><given-names>AJ</given-names> </name></person-group><article-title>Artificial intelligence and black-box medical decisions: accuracy versus explainability</article-title><source>Hastings Cent Rep</source><year>2019</year><month>01</month><volume>49</volume><issue>1</issue><fpage>15</fpage><lpage>21</lpage><pub-id pub-id-type="doi">10.1002/hast.973</pub-id><pub-id pub-id-type="medline">30790315</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Starke</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gille</surname><given-names>F</given-names> </name><name name-style="western"><surname>Termine</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Finding consensus on trust in AI in health care: recommendations from a panel of international experts</article-title><source>J Med Internet Res</source><year>2025</year><month>02</month><day>19</day><volume>27</volume><fpage>e56306</fpage><pub-id pub-id-type="doi">10.2196/56306</pub-id><pub-id 
pub-id-type="medline">39969962</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jeong</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sohn</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sung</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kang</surname><given-names>J</given-names> </name></person-group><article-title>Improving medical reasoning through retrieval and self-reflection with retrieval-augmented large language models</article-title><source>Bioinformatics</source><year>2024</year><month>06</month><day>28</day><volume>40</volume><issue>Suppl 1</issue><fpage>i119</fpage><lpage>i129</lpage><pub-id pub-id-type="doi">10.1093/bioinformatics/btae238</pub-id><pub-id pub-id-type="medline">38940167</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goh</surname><given-names>R</given-names> </name><name name-style="western"><surname>Cook</surname><given-names>B</given-names> </name><name name-style="western"><surname>Stretton</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Large language models can effectively extract stroke and reperfusion audit data from medical free-text discharge summaries</article-title><source>J Clin Neurosci</source><year>2024</year><month>11</month><volume>129</volume><fpage>110847</fpage><pub-id pub-id-type="doi">10.1016/j.jocn.2024.110847</pub-id><pub-id pub-id-type="medline">39305548</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miao</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Thongprayoon</surname><given-names>C</given-names> </name><name name-style="western"><surname>Suppadungsuk</surname><given-names>S</given-names> </name><name name-style="western"><surname>Garcia Valencia</surname><given-names>OA</given-names> </name><name name-style="western"><surname>Cheungpasitporn</surname><given-names>W</given-names> </name></person-group><article-title>Integrating retrieval-augmented generation with large language models in nephrology: advancing practical applications</article-title><source>Medicina (Kaunas)</source><year>2024</year><month>03</month><day>8</day><volume>60</volume><issue>3</issue><fpage>445</fpage><pub-id pub-id-type="doi">10.3390/medicina60030445</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary materials, including the design and technical elements of the RAG-LLM tool. LLM: large language model; RAG: retrieval-augmented generation.</p><media xlink:href="jmir_v27i1e78393_app1.docx" xlink:title="DOCX File, 868 KB"/></supplementary-material></app-group></back></article>