<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v26i1e58041</article-id>
      <article-id pub-id-type="pmid">39046096</article-id>
      <article-id pub-id-type="doi">10.2196/58041</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Enhancement of the Performance of Large Language Models in Diabetes Education through Retrieval-Augmented Generation: Comparative Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Eysenbach</surname>
            <given-names>Gunther</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhang</surname>
            <given-names>Guangyuan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mistry</surname>
            <given-names>Jinal</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zhu</surname>
            <given-names>Yuanda</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Dingqiao</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0007-2648-6677</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Liang</surname>
            <given-names>Jiangbo</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0002-6290-5748</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Ye</surname>
            <given-names>Jinguo</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0008-7440-4100</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Jingni</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0006-4716-1532</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Jingpeng</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0003-9535-189X</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Zhang</surname>
            <given-names>Qikai</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0002-3183-3200</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Hu</surname>
            <given-names>Qiuling</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0004-5627-776X</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Pan</surname>
            <given-names>Caineng</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7988-5363</ext-link>
        </contrib>
        <contrib id="contrib9" contrib-type="author">
          <name name-style="western">
            <surname>Wang</surname>
            <given-names>Dongliang</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0605-5726</ext-link>
        </contrib>
        <contrib id="contrib10" contrib-type="author">
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>Zhong</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9209-4179</ext-link>
        </contrib>
        <contrib id="contrib11" contrib-type="author">
          <name name-style="western">
            <surname>Shi</surname>
            <given-names>Wen</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0006-5084-5265</ext-link>
        </contrib>
        <contrib id="contrib12" contrib-type="author">
          <name name-style="western">
            <surname>Shi</surname>
            <given-names>Danli</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6094-137X</ext-link>
        </contrib>
        <contrib id="contrib13" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Li</surname>
            <given-names>Fei</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-6211-6306</ext-link>
        </contrib>
        <contrib id="contrib14" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Qu</surname>
            <given-names>Bo</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0007-7985-9473</ext-link>
        </contrib>
        <contrib id="contrib15" contrib-type="author" corresp="yes" equal-contrib="yes">
          <name name-style="western">
            <surname>Zheng</surname>
            <given-names>Yingfeng</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>State Key Laboratory of Ophthalmology, Zhongshan Ophthalmic Center, Sun Yat-sen University</institution>
            <institution>Guangdong Provincial Key Laboratory of Ophthalmology and Visual Science</institution>
            <institution>Guangdong Provincial Clinical Research Center for Ocular Diseases</institution>
            <addr-line>State Key Laboratory of Ophthalmology, Zhongshan Ophthalmic Center</addr-line>
            <addr-line>Sun Yat-sen University, 07 Jinsui Road</addr-line>
            <addr-line>GuangZhou, 510060</addr-line>
            <country>China</country>
            <phone>86 139 2228 6455</phone>
            <email>zhyfeng@mail.sysu.edu.cn</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9952-6445</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>State Key Laboratory of Ophthalmology, Zhongshan Ophthalmic Center, Sun Yat-sen University</institution>
        <institution>Guangdong Provincial Key Laboratory of Ophthalmology and Visual Science</institution>
        <institution>Guangdong Provincial Clinical Research Center for Ocular Diseases</institution>
        <addr-line>GuangZhou</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Research Centre for SHARP Vision</institution>
        <institution>The Hong Kong Polytechnic University</institution>
        <addr-line>Hong Kong</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Peking University Third Hospital</institution>
        <addr-line>Beijing</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Yingfeng Zheng <email>zhyfeng@mail.sysu.edu.cn</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>8</day>
        <month>11</month>
        <year>2024</year>
      </pub-date>
      <volume>26</volume>
      <elocation-id>e58041</elocation-id>
      <history>
        <date date-type="received">
          <day>4</day>
          <month>3</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>1</day>
          <month>5</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>3</day>
          <month>6</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>15</day>
          <month>7</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Dingqiao Wang, Jiangbo Liang, Jinguo Ye, Jingni Li, Jingpeng Li, Qikai Zhang, Qiuling Hu, Caineng Pan, Dongliang Wang, Zhong Liu, Wen Shi, Danli Shi, Fei Li, Bo Qu, Yingfeng Zheng. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 08.11.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2024/1/e58041" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Large language models (LLMs) demonstrated advanced performance in processing clinical information. However, commercially available LLMs lack specialized medical knowledge and remain susceptible to generating inaccurate information. Given the need for self-management in diabetes, patients commonly seek information online. We introduce the Retrieval-augmented Information System for Enhancement (RISE) framework and evaluate its performance in enhancing LLMs to provide accurate responses to diabetes-related inquiries.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aimed to evaluate the potential of the RISE framework, an information retrieval and augmentation tool, to improve the LLM’s performance to accurately and safely respond to diabetes-related inquiries.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>The RISE, an innovative retrieval augmentation framework, comprises 4 steps: rewriting query, information retrieval, summarization, and execution. Using a set of 43 common diabetes-related questions, we evaluated 3 base LLMs (GPT-4, Anthropic Claude 2, Google Bard) and their RISE-enhanced versions respectively. Assessments were conducted by clinicians for accuracy and comprehensiveness and by patients for understandability.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The integration of RISE significantly improved the accuracy and comprehensiveness of responses from all 3 base LLMs. On average, the percentage of accurate responses increased by 12% (15/129) with RISE. Specifically, the rates of accurate responses increased by 7% (3/43) for GPT-4, 19% (8/43) for Claude 2, and 9% (4/43) for Google Bard. The framework also enhanced response comprehensiveness, with mean scores improving by 0.44 (SD 0.10). Understandability was also enhanced by 0.19 (SD 0.13) on average. Data collection was conducted from September 30, 2023 to February 5, 2024.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>The RISE significantly improves LLMs’ performance in responding to diabetes-related inquiries, enhancing accuracy, comprehensiveness, and understandability. These improvements have crucial implications for RISE’s future role in patient education and chronic illness self-management, which contributes to relieving medical resource pressures and raising public awareness of medical knowledge.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>large language models</kwd>
        <kwd>LLMs</kwd>
        <kwd>retrieval-augmented generation</kwd>
        <kwd>RAG</kwd>
        <kwd>GPT-4.0</kwd>
        <kwd>Claude-2</kwd>
        <kwd>Google Bard</kwd>
        <kwd>diabetes education</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>Diabetes mellitus is a chronic long-term illness that requires continual health education and assistance to improve patient outcomes [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. The shortage of diabetes counselors and the limitations of traditional education methods make it challenging to address the unique requirements of each diabetic patient [<xref ref-type="bibr" rid="ref3">3</xref>]. Large language models (LLMs), such as ChatGPT, hold significant promise in diabetes self-management and information assessment [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref8">8</xref>]. However, concerns exist around the accuracy and reliability of these models, mainly stemming from the variable credibility of their training data which is sourced from a wide variety of internet text and self-supervised learning [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. Furthermore, LLMs may lack domain-specific knowledge, risking the production of potentially inaccurate responses [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref15">15</xref>].</p>
        <p>Recent studies have primarily assessed the capabilities of LLMs in responding to diabetes-related questions, revealing limitations in their expertise in medical specialties, which remain unresolved. For example, Meo et al [<xref ref-type="bibr" rid="ref16">16</xref>] indicated that both ChatGPT and Google Bard scored below 60% in endocrinology and diabetes. They concluded that while these artificial intelligence tools show potential in academic medical education, they require more updated information in these specific medical fields. Goodman et al [<xref ref-type="bibr" rid="ref17">17</xref>] also highlighted the precision of chatbots in medical queries and underlined the need for further research and model development for enhanced accuracy and validation in clinical practice. Hulman et al [<xref ref-type="bibr" rid="ref18">18</xref>] showed that ChatGPT-generated responses could be distinguished from expert responses by 59.5%, suggesting a gap compared with expert human performance. Therefore, addressing these gaps by augmenting LLMs with more specialized knowledge and updated information is crucial for improving their role in patient understanding and management of diabetes.</p>
        <p>In response to these unresolved challenges, our study introduces “RISE” (Retrieval-augmented Information System for Enhancement), an independent workflow designed to enhance the performance of LLMs in the medical domain by automatically retrieving real-time external knowledge. We used LLMs with and without RISE to answer diabetes-related inquiries from patients, assessing the improvements that RISE brings to the original LLMs in terms of accuracy, comprehensiveness, and understandability. Our RISE aims to bridge the knowledge gaps identified in LLMs, providing a more robust and reliable tool for addressing patient concerns about diabetes management and understanding.</p>
        <p>The main contributions of our work are (1) we introduce RISE, an innovative framework based on the retrieval-augmented generation (RAG) algorithm that enhances LLMs with real-time, domain-specific knowledge to provide accurate and comprehensive responses to diabetes-related inquiries, improving patient self-management and outcomes. (2) We reduce the risk of inaccurate or irrelevant responses from LLMs by integrating local and external real-time information retrieval, enhancing model transparency by identifying source information. (3) We incorporate an additional module for accuracy and safety checks before responding, ensuring that the provided information is reliable and free from harmful content. (4) We validate the RISE framework through assessments by clinicians and patients to demonstrate the feasibility of adopting RISE-enhanced LLMs in diabetes management and education.</p>
      </sec>
      <sec>
        <title>Related Works</title>
        <sec>
          <title>Large Language Models</title>
          <p>LLMs, such as GPT-3 [<xref ref-type="bibr" rid="ref19">19</xref>], GPT-4 [<xref ref-type="bibr" rid="ref20">20</xref>], and PaLM [<xref ref-type="bibr" rid="ref21">21</xref>], have garnered significant attention due to their exceptional language understanding and generation capabilities [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. However, when applied to domain-specific tasks, particularly in the medical field, their performance may be limited by a lack of specialized knowledge and vocabulary in the training data [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref26">26</xref>]. Adapting LLMs for biomedical applications presents several challenges, including insufficient domain knowledge and high computational costs. As a result, only a few LLMs have been fine-tuned for medical consultation using open-source models with 6.5-13 billion parameters, such as ChatDoctor [<xref ref-type="bibr" rid="ref27">27</xref>] and MedAlpaca [<xref ref-type="bibr" rid="ref28">28</xref>]. However, this approach of fine-tuning open-source models has its limitations. Medical domain-specific models often use relatively smaller-scale LLMs (eg, LLaMA [<xref ref-type="bibr" rid="ref27">27</xref>] with 7B parameters), which may result in lower accuracy and robustness, compared with GPT-4 [<xref ref-type="bibr" rid="ref29">29</xref>]. Moreover, fine-tuning even these smaller LLMs is computationally intensive and costly [<xref ref-type="bibr" rid="ref27">27</xref>]. The introduction of new knowledge requires complete retraining of the model, placing additional burdens on developers. Furthermore, LLMs are generally prone to hallucination, which is a challenge that fine-tuning struggles to address [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref33">33</xref>].</p>
        </sec>
        <sec>
          <title>Retrieval-Augmented Generation in Medical Questions and Answers</title>
          <p>Recent studies have explored the application of RAG [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>] in the medical domain to enhance the performance of LLMs in question-answering tasks. These approaches enable LLMs to achieve improved performance without needing time-intensive and costly fine-tuning while facilitating timely updates without retraining the entire model.</p>
          <p>In specialized medical domains, LLMs have been augmented with limited medical corpora to address specific areas such as liver diseases (LiVersa) [<xref ref-type="bibr" rid="ref36">36</xref>], diffuse large B-cell lymphoma [<xref ref-type="bibr" rid="ref32">32</xref>], and nephrology [<xref ref-type="bibr" rid="ref37">37</xref>]. Simultaneously, in the general medical context, frameworks such as Almanac [<xref ref-type="bibr" rid="ref38">38</xref>] and RECTIFIER [<xref ref-type="bibr" rid="ref39">39</xref>] have been proposed to integrate LLMs with medical guidelines and treatment recommendations.</p>
          <p>Despite their potential, these approaches also present several limitations. The effectiveness of RAG-based models largely depends on the quality and currency of the data sources used. Previous studies typically rely on fixed and relatively small knowledge bases, such as Wikipedia or guidance documents, thereby limiting their effectiveness in specialized medical domains [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. Outdated or incorrect information can result in inaccurate or misleading outputs. Furthermore, retrieval errors or the inclusion of biased and unsafe content inherent in the LLMs, without further filtering, may lead to inaccuracies in the generated output, potentially misleading patients.</p>
          <p>Our RISE framework addresses these limitations by communicating with local and internet-based knowledge sources, curated from over 200 reputable academic websites, ensuring access to a wide range of up-to-date clinical evidence. Moreover, we incorporate additional fact-checking and safety check modules before responding. By prioritizing the accuracy and safety of the retrieved information, our framework offers a more reliable and secure pathway for answering clinical questions, significantly reducing the risk of misleading patients.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Framework of Retrieval-Augmented Information System for Enhancement</title>
        <p>Our study introduced the RISE framework, an innovative approach designed to improve the performance of medical question answering of LLMs. Our novel algorithm derives from RAG [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>], which retrieves pertinent information from local databases or external knowledge from academic websites. Our RISE is a standalone framework comprising four steps (<xref rid="figure1" ref-type="fig">Figure 1</xref> and <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Comparing responses of LLMs before and after “RISE” integration. Red bars: response from base LLMs without RISE framework. Blue bars: overview of RISE framework and query response after integration with RISE. The framework of RISE: (1) Rewriting query: Improve query accuracy and relevance using large language models. (2) Information retrieval: Search for relevant information using the revised query from the local data set and external knowledge base. (3) Summarization: Summarize retrieved information into concise key points, combined with fact-checking and safety checks. (4) Execution: LLMs take action based on summarized information (for implementation details, refer to Multimedia Appendix 1). LLMs: large language models; RISE: Retrieval-augmented Information System for Enhancement.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e58041_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Rewriting Query</title>
        <p>The first step in the RISE framework involves rewriting the original query using advanced LLMs, including GPT-4, Claude 2, and Google Bard (Alphabet Inc; subsequently rebranded as Gemini). This process aims to enhance the query by correcting spelling errors, expanding abbreviations, and incorporating synonyms, thereby broadening the scope of potential results.</p>
      </sec>
      <sec>
        <title>Information Retrieval</title>
        <p>Relevant information is retrieved from a local vectorized database and external knowledge sources. The rewritten query is embedded in the same vector space as the database, and the Facebook AI Similarity Search (FAISS) algorithm is used for similarity search to find the top 5 most pertinent documents (retriever = vectorstore.as_retriever(search_type="similarity", search_kwargs={"k": 5}); results = retriever.invoke(query)). If no results are found locally, external knowledge is sourced from academic websites (over 200), ensuring that all information adheres to stringent academic and research standards.</p>
      </sec>
      <sec>
        <title>Summarization</title>
        <p>The third step involves summarizing the retrieved information into a concise and understandable format by prompt. This step also includes fact-checking and safety checks to ensure accuracy and reduce harmful content.</p>
        <p>Fact-checking is performed in 2 steps. First, the retrieved raw text and the question are input, and the retrieved text is broken down into multiple claims. Second, these claims and the question are input, allowing the model to self-check which claims are confirmed using external knowledge sources. The model then returns the verified claims as the final summarization text.</p>
        <p>The safety check process uses a set of 24 rules to restrict and filter the content, ensuring the generated responses are safe and appropriate. The model is prompted with the instruction, that is “Your answer must adhere to the following rules: {rules}.”</p>
      </sec>
      <sec>
        <title>Execution</title>
        <p>The final step involves presenting the summarized information and prompts to the LLMs to generate the final answer for the user. The prompt instructs “Use the following pieces of context to answer the question at the end. Note that your response should be as brief as possible and no more than 300 words. If you don’t know the answer, just say that you don’t know, don’t try to make up an answer.”</p>
      </sec>
      <sec>
        <title>Local Database</title>
        <p>A local database of diabetes-related information was created to provide domain-specific knowledge for the RISE framework. PubMed Central [<xref ref-type="bibr" rid="ref44">44</xref>] was used to acquire a corpus of scientific papers and clinical practice guidelines relevant to diabetes. The database covers various aspects of diabetes, including pathophysiology, diagnosis, treatment, management, and patient education, rather than answering specific questions used in the evaluation. The retrieved documents comprise over 600 full-text articles.</p>
        <p>The retrieved documents were then preprocessed to remove potentially unstructured or noisy information, such as figures, tables, references, and author disclosures. After cleaning each document, the CharacterTextSplitter function from Langchain was used to divide the documents into smaller fragments. We then used the OpenAI model Text-Embedding-ADA-002 as an embedding function to generate embeddings for each fragment in FAISS using the function call db = FAISS.from_documents(docs, embeddings), where “docs” refers to the document fragments and “embeddings” refers to the Text-Embedding-ADA-002 model. The resulting index was saved locally for continuous access and retrieval using the function call db.save_local("faiss_index").</p>
        <p>When a user submitted a question, the rewritten query was transformed into an embedding vector and compared with the database embeddings using cosine similarity. The top k=5 document segments with the highest similarity scores were retrieved and used as the knowledge context for the user’s query. A sample of the data set and related code are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref> and <xref ref-type="supplementary-material" rid="app2">2</xref>.</p>
      </sec>
      <sec>
        <title>Study Design</title>
        <sec>
          <title>Overview</title>
          <p>This study was conducted at Sun Yat-sen University from September 25, 2023, to February 30, 2024. The 43 diabetes-related questions were selected from the National Institute of Diabetes and Digestive and Kidney Diseases website [<xref ref-type="bibr" rid="ref45">45</xref>] across the following 5 domains: concepts of diabetes, symptoms and causes, diabetes tests and diagnosis, managing diabetes, and prevention. The questions aimed to cover topics commonly asked by the public and patients regarding diabetes care.</p>
        </sec>
        <sec>
          <title>Response Generation</title>
          <p>We prepared a set of 43 diabetes-related inquiries to be posed to 3 base language models - GPT-4, Claude 2, and Google Bard - as well as their respective versions enhanced by RISE. In total, there were 6 models involved, with an enhanced version corresponding to each base model. From September 30, 2023, to February 5, 2024, we independently fed the entire set of 43 inquiries into each of the 6 models, treating each question as a separate input and resetting the conversation between queries to minimize bias.</p>
          <p>Model responses were evaluated in a blinded, randomized manner through 2 aspects - first by clinician assessment focusing on accuracy and comprehensiveness, and then by patient evaluation of understandability. The evaluation process involved clinicians with over 5 years of experience in general medicine. The responses from all 6 models were shuffled randomly into 6 different rounds. To remove potential model indicators from responses, they were transformed into plain text before being distributed to 3 clinicians and 3 diabetes patients. They all analyzed responses across 6 rounds spaced 48 hours apart to eliminate confounding (<xref rid="figure2" ref-type="fig">Figure 2</xref>). Responses for each model and raw scores for evaluation are provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Flowchart of overall study design. The study evaluates the performance of 3 publicly available large language models and their RISE-enhanced versions in addressing common diabetes-related inquiries. The evaluation is conducted from the perspectives of both the clinicians and diabetic patients. Clinicians evaluate the accuracy and comprehensiveness of responses. Patients assess the understandability.</p>
            </caption>
            <graphic xlink:href="jmir_v26i1e58041_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
      <sec>
        <title>Accuracy Evaluation</title>
        <p>We conducted an accuracy evaluation for each response by assigning scores and ratings. A “Poor” rating received 1 point, “Borderline” received 2 points, and “Good” received 3 points. Each response underwent assessment by 3 clinicians. For scoring, the score for each question is the sum of the scores assigned by the 3 graders, with a maximum possible score of 9. For rating, we used a majority consensus method among the 3 clinicians. A response was considered “Good” only if at least 2 clinicians rated it as such. In cases where the 3 clinicians provided differing ratings, we implemented a stringent strategy by giving the response the lowest mark (ie, “Poor”). The accuracy rate is defined as the proportion of responses with a final rating of “Good.”</p>
        <p>The accuracy scoring criteria include (1) “Poor” indicating replies containing mistakes that might considerably mislead patients and potentially result in damage, (2) “Borderline” assigned to answers with potential inaccuracies but unlikely to misguide or damage patients, and (3) “Good” reserved for replies without errors.</p>
      </sec>
      <sec>
        <title>Comprehensiveness Evaluation</title>
        <p>For responses that obtained a “good” rating by majority consensus, the clinicians further evaluated the comprehensiveness of each response. A 5-point scale was used: (1) “not comprehensive” for responses critically missing information (1 point), (2) “slightly comprehensive” for responses with limited but primary details (2 points), (3) “moderately comprehensive” for responses providing more than half of the essential information (3 points), (4) “comprehensive” for responses covering most critical points (4 points), and (5) “very comprehensive” for responses giving comprehensive information (5 points). For each response, the average score was calculated as the mean of the scores assigned by the 3 clinicians.</p>
      </sec>
      <sec>
        <title>Understandability Evaluation</title>
        <p>A total of 3 diabetic patients evaluated response understandability. A 5-point scale different from that of the comprehensiveness evaluation was used: (1) “very poor” for responses difficult to understand or completely irrelevant (1 point), (2) “poor” for responses somewhat difficult to understand or partially irrelevant (2 points), (3) “average” for responses generally understandable but requiring some effort or having minor ambiguities (3 points), (4) “good” for responses most of which are easily understandable with very few unclear parts (4 points), and (5) “excellent” for responses very clear and easy to understand, fully meeting the reader’s needs (5 points). For each response, the average score for understandability was calculated as the mean of the scores given by the 3 patients.</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>Statistical analyses were performed using SPSS (version 22.0; IBM Corp). Normal distribution was assessed with the Kolmogorov–Smirnov test. Our data were found not to follow a normal distribution (<italic>P</italic>&#60;.001). Group comparisons used the Wilcoxon signed-rank test for accuracy, comprehensiveness, and understandability scores with and without RISE. For the comparison of the proportions of “good,” “borderline,” and “poor” ratings across the models, the chi-square test was used. <italic>P</italic> values &#60;.05 were regarded as significant.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study involved publicly available data without collecting human or animal samples and data. The study has been approved by the ethics committee of Zhongshan Ophthalmic Center (ZOC, Guangzhou, China; approval number 2024KYPJ124).</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Accuracy Evaluation</title>
        <p>We evaluated 3 LLMs and their RISE-enhanced versions for answering diabetes-related questions. As shown in <xref rid="figure3" ref-type="fig">Figure 3</xref>, the average accuracy scores of all 3 original models increased substantially with the RISE enhancement. Specifically, the accuracy scores improved from 8.72 (SD 0.70) to 8.91 (SD 0.37; <italic>P</italic>=.09) for GPT-4 after applying RISE, 8.09 (SD 1.23) to 8.65 (SD 0.65; <italic>P</italic>=.001) for Claude, and 8.37 (SD 1.36) to 8.86 (SD 0.47; <italic>P</italic>=.01) for Bard (maximum score per response is 9 points).</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Average scores of responses from large language models. Answers from each model were scored 1-3 points by 3 clinicians. The maximum score for each response is 9 points. An asterisk (*) denotes statistical significance at <italic>P</italic>&#60;.05. Model call dates: September 30, 2023 to February 5, 2024. RISE: Retrieval-augmented Information System for Enhancement.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e58041_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>We further evaluated the percentage rated as “Good,” representing accuracy rates, of the LLMs with and without RISE (<xref rid="figure4" ref-type="fig">Figure 4</xref>). The results showed increased accuracy rates after incorporating RISE across all original models. Specifically, after the incorporation of RISE, the proportion of accurate responses for GPT-4 increased from 91% (39/43) to 98% (42/43), for Claude from 72% (31/43) to 91% (39/43), and for Bard from 86% (37/43) to 95% (41/43). Furthermore, GPT-4 enhanced by RISE exhibited the highest accuracy rate, reaching 98% (42/43). In addition, <xref ref-type="table" rid="table1">Table 1</xref> presents the accuracy of the models across 5 domains. All 6 models achieved the highest accuracy, reaching 100% (16/16), in the “Preventing Diabetes Problems” domain. However, in the “Concepts of Diabetes” and “Symptoms &#38; Causes” domains, the models exhibited relatively lower average accuracy rates.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Accuracy rates (proportion of “Good” responses) of large language models. Model call dates: September 30, 2023 to February 5, 2024. RISE: Retrieval-augmented Information System for Enhancement.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e58041_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Accuracy of model response across 5 diabetes educational domains.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="240"/>
            <col width="90"/>
            <col width="140"/>
            <col width="110"/>
            <col width="110"/>
            <col width="110"/>
            <col width="0"/>
            <col width="110"/>
            <col width="90"/>
            <col width="0"/>
            <thead>
              <tr valign="top">
                <td>Domain</td>
                <td>Responses, n</td>
                <td colspan="2">GPT-4, n (%)</td>
                <td colspan="3">Claude 2, n (%)</td>
                <td colspan="3">Google Bard, n (%)</td>
              </tr>
              <tr valign="bottom">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Without RISE<sup>a</sup></td>
                <td>With RISE</td>
                <td>Without RISE</td>
                <td>With RISE</td>
                <td colspan="2">Without RISE</td>
                <td>With RISE</td>
                <td>
                  <break/>
                </td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Concepts of diabetes</td>
                <td>4</td>
                <td>2 (50)</td>
                <td>4 (100)</td>
                <td>2 (50)</td>
                <td>2 (50)</td>
                <td colspan="2">2 (50)</td>
                <td>4 (100)</td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>Symptoms and causes</td>
                <td>4</td>
                <td>3 (75)</td>
                <td>3 (75)</td>
                <td>1 (25)</td>
                <td>3 (75)</td>
                <td colspan="2">2 (50)</td>
                <td>4 (100)</td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>Diabetes tests and diagnosis</td>
                <td>4</td>
                <td>4 (100)</td>
                <td>4 (100)</td>
                <td>3 (75)</td>
                <td>4 (100)</td>
                <td colspan="2">3 (75)</td>
                <td>3 (75)</td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>Managing diabetes</td>
                <td>15</td>
                <td>14 (93)</td>
                <td>15 (100)</td>
                <td>9 (60)</td>
                <td>14 (93)</td>
                <td colspan="2">14 (93)</td>
                <td>14 (93)</td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>Preventing diabetes problems</td>
                <td>16</td>
                <td>16 (100)</td>
                <td>16 (100)</td>
                <td>16 (100)</td>
                <td>16 (100)</td>
                <td colspan="2">16 (100)</td>
                <td>16 (100)</td>
                <td>
                  <break/>
                </td>
              </tr>
              <tr valign="top">
                <td>Total</td>
                <td>43</td>
                <td>39 (91)</td>
                <td>42 (98)</td>
                <td>31 (72)</td>
                <td>39 (91)</td>
                <td colspan="2">37 (86)</td>
                <td>41 (95)</td>
                <td>
                  <break/>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>RISE: Retrieval-augmented Information System for Enhancement.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Accuracy rates indicate the percentage rated as “Good” in accuracy evaluation.</p>
      </sec>
      <sec>
        <title>Comprehensiveness Evaluation</title>
        <p>The study also assessed the comprehensiveness of model responses through a 1- to 5-point rating scale by 3 clinicians for the responses rated as “good” (<xref ref-type="table" rid="table2">Table 2</xref>). The results revealed that the integration of RISE led to a decrease in the number of responses with scores of 3 or lower and an increase in the number of responses with higher scores of (4, 5]. For instance, after incorporation of RISE, the number of responses scoring (1, 2] and (2, 3] decreased from 3 to 0 for GPT-4, from 6 to 3 for Claude, and from 8 to 2 for Bard. In addition, the number of responses scoring (4, 5] increased from 19 to 38 for GPT-4, from 9 to 24 for Claude, and from 13 to 18 for Bard.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Comprehensiveness evaluation for responses of large language models with and without RISE.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="220"/>
            <col width="140"/>
            <col width="130"/>
            <col width="120"/>
            <col width="130"/>
            <col width="140"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Score range</td>
                <td colspan="2">GPT-4</td>
                <td colspan="2">Claude 2</td>
                <td colspan="2">Google Bard</td>
              </tr>
              <tr valign="bottom">
                <td>
                  <break/>
                </td>
                <td>Without RISE<sup>a</sup> (n=39)</td>
                <td>With RISE (n=42)</td>
                <td>Without RISE (n=31)</td>
                <td>With RISE (n=39)</td>
                <td>Without RISE (n=37)</td>
                <td>With RISE (n=41)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>(1, 2], n (%)</td>
                <td>2 (5)</td>
                <td>0 (0)</td>
                <td>2 (7)</td>
                <td>0 (0)</td>
                <td>2 (5)</td>
                <td>0 (0)</td>
              </tr>
              <tr valign="top">
                <td>(2, 3], n (%)</td>
                <td>1 (3)</td>
                <td>0 (0)</td>
                <td>4 (13)</td>
                <td>3 (8)</td>
                <td>6 (16)</td>
                <td>2 (5)</td>
              </tr>
              <tr valign="top">
                <td>(3, 4], n (%)</td>
                <td>17 (44)</td>
                <td>4 (10)</td>
                <td>16 (52)</td>
                <td>12 (31)</td>
                <td>16 (43)</td>
                <td>21 (51)</td>
              </tr>
              <tr valign="top">
                <td>(4, 5], n (%)</td>
                <td>19 (49)</td>
                <td>38 (91)</td>
                <td>9 (29)</td>
                <td>24 (62)</td>
                <td>13 (35)</td>
                <td>18 (44)</td>
              </tr>
              <tr valign="top">
                <td>Score, mean (SD)<sup>b</sup></td>
                <td>4.14 (0.72)</td>
                <td>4.69 (0.39)</td>
                <td>3.79 (0.78)</td>
                <td>4.20 (0.60)</td>
                <td>3.73 (0.80)</td>
                <td>4.10 (0.62)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>RISE: Retrieval-augmented Information System for Enhancement.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>SD: Standard deviation. For responses rated as “good” by most graders, comprehensiveness was further evaluated.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Furthermore, the average scores for comprehensiveness also improved significantly after integrating RISE. GPT-4’s average score increased from 4.14 (SD 0.72) to 4.69 (SD 0.39; <italic>P</italic>&#60;.001), Claude increased from 3.79 (SD 0.78) to 4.20 (SD 0.60; <italic>P</italic>=.002), and Bard increased from 3.73 (SD 0.80) to 4.10 (SD 0.62; <italic>P</italic>=.001). Among the 3 models, GPT-4 consistently achieved the highest scores for comprehensiveness both before and after the integration of RISE, with scores of 4.14 (SD 0.72) and 4.69 (SD 0.39), respectively.</p>
      </sec>
      <sec>
        <title>Understandability Evaluation</title>
        <p>In addition to assessing the accuracy and comprehensiveness of model responses by clinicians, this study also evaluated the public’s understanding of responses (<xref ref-type="table" rid="table3">Table 3</xref>). A total of 3 diabetes patients rated the understandability on a scale of 1 to 5. The results indicated that the integration of RISE led to a decrease in the number of responses with scores of 4 or lower and an increase in the number of responses with scores of (4, 5]. Specifically, after incorporation of RISE, the number of responses scoring 4 or lower was reduced from 15 to 4 for GPT-4, 27 to 24 for Claude, and 31 to 21 for Bard. In addition, the number of responses with higher scores of (4, 5] increased from 28 to 39 for GPT-4, 16 to 19 for Claude, and 12 to 22 for Bard after the incorporation of RISE.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Evaluation of public understandability in responses from large language models with and without RISE.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="190"/>
            <col width="140"/>
            <col width="150"/>
            <col width="120"/>
            <col width="130"/>
            <col width="0"/>
            <col width="140"/>
            <col width="130"/>
            <thead>
              <tr valign="top">
                <td>Score range</td>
                <td colspan="2">GPT-4</td>
                <td colspan="3">Claude 2</td>
                <td colspan="2">Google Bard</td>
              </tr>
              <tr valign="bottom">
                <td>
                  <break/>
                </td>
                <td>Without RISE<sup>a</sup></td>
                <td>With RISE</td>
                <td>Without RISE</td>
                <td>With RISE</td>
                <td colspan="2">Without RISE</td>
                <td>With RISE</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>(1, 2], n (%)</td>
                <td>0 (0)</td>
                <td>0 (0)</td>
                <td>0 (0)</td>
                <td>0 (0)</td>
                <td colspan="2">0 (0)</td>
                <td>0 (0)</td>
              </tr>
              <tr valign="top">
                <td>(2, 3], n (%)</td>
                <td>0 (0)</td>
                <td>0 (0)</td>
                <td>1 (2)</td>
                <td>2 (5)</td>
                <td colspan="2">1 (2)</td>
                <td>1 (2)</td>
              </tr>
              <tr valign="top">
                <td>(3, 4], n (%)</td>
                <td>15 (35)</td>
                <td>4 (9)</td>
                <td>26 (61)</td>
                <td>22 (51)</td>
                <td colspan="2">30 (70)</td>
                <td>20 (47)</td>
              </tr>
              <tr valign="top">
                <td>(4, 5], n (%)</td>
                <td>28 (65)</td>
                <td>39 (91)</td>
                <td>16 (37)</td>
                <td>19 (44)</td>
                <td colspan="2">12 (28)</td>
                <td>22 (51)</td>
              </tr>
              <tr valign="top">
                <td>Score, mean (SD)<sup>b</sup></td>
                <td>4.32 (0.61)</td>
                <td>4.64 (0.51)</td>
                <td>4.01 (0.73)</td>
                <td>4.07 (0.74)</td>
                <td colspan="2">3.96 (0.86)</td>
                <td>4.16 (0.82)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>RISE: Retrieval-augmented Information System for Enhancement.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>SD: Standard deviation. An understandability evaluation was conducted for all responses.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Furthermore, the average scores for understandability also improved after RISE integration. GPT-4’s average score increased from 4.32 (SD 0.61) to 4.64 (SD 0.51; <italic>P</italic>&#60;.001), Claude’s increased from 4.01 (SD 0.73) to 4.07 (SD 0.74; <italic>P</italic>=.31), and Bard’s increased from 3.96 (SD 0.86) to 4.16 (SD 0.82; <italic>P</italic>=.002). Among the 3 models, GPT-4 consistently exhibited the highest understandability scores both before and after RISE integration, with scores of 4.32 (SD 0.61) and 4.64 (SD 0.51), respectively.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Considering the prevalence of type 2 diabetes mellitus as a major public health concern, particularly in light of the widespread dependence of patients on online resources for health-related information, this study introduces RISE workflow models to enhance the performance of LLMs as timely and relevant diabetes education tools [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref47">47</xref>]. Our findings demonstrate that RISE significantly improves the accuracy and comprehensiveness of LLM responses to patient queries about diabetes management and care. On average, the percentage of accurate responses increased by 12% (15/129) with RISE, with rates increasing by 7% (3/43) for GPT-4, 19% (8/43) for Claude 2, and 9% (4/43) for Google Bard. The framework also enhanced response comprehensiveness and understandability, improving mean scores by 0.44 (SD 0.10) and 0.19 (SD 0.13), respectively.</p>
      </sec>
      <sec>
        <title>Comparison to Previous Work</title>
        <p>Previous studies have also applied LLMs in diabetes management and education. A study by Sun et al [<xref ref-type="bibr" rid="ref7">7</xref>] found that 74.5% (149/200) of GPT-4’s answers accurately responded to 200 frequently asked questions on diabetes management education. Hernandez et al [<xref ref-type="bibr" rid="ref48">48</xref>] showed ChatGPT could correctly answer 98.5% (69/70) of patient questions about type 2 diabetes, and the 1.5% (1/70) inappropriate response needs to be improved. These findings were consistent with our results before integrating RISE, showing 91% (39/43) accuracy for base GPT-4 in responding to diabetes questions. Although most information provided by advanced LLMs may be correct, it is essential to realize that even small mistakes can potentially cause significant problems, especially in medical scenarios. Even minimal misinformation can lead to misconceptions, which might inadvertently delay treatment. Thus, minimizing potential errors and improving accuracy and validation are required before considering LLM integration into patient diabetes care.</p>
        <p>RAG has shown promise in enhancing LLM performance [<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]; however, most current RAG approaches rely on fixed, smaller, static knowledge bases. Our results showed model responses were more specific and accurate than those generated by general LLMs after incorporating specific knowledge of the RISE framework, which is consistent with previous studies. Previous studies have applied RAG in other clinical specialties, such as general medicine, hepatology, and lymphoma [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref51">51</xref>]. These studies’ knowledge bases were mainly medical texts, research papers, and disease guidelines, limiting their flexibility and generalizability. In contrast, the RISE framework used a local medical knowledge base from NIH (National Institutes of Health) and the dynamic, real-time retrieval of external knowledge through the latest medical guidelines, academic research papers, and reputable health websites.</p>
      </sec>
      <sec>
        <title>Future Directions</title>
        <p>The RISE framework demonstrates the potential of RAG in enhancing the performance of LLMs for diabetes education, and there are several promising directions for future research and development. These include creating large specialized medical knowledge bases tailored for diabetes education, integrating multimodal data such as medical images and electronic health records, and developing domain-specific retrieval and ranking algorithms for evidence-based information [<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>]. Furthermore, exploring the bilingual or multilingual potential of these chatbots, such as investigating the performance of the RISE framework when questions are asked in languages like Chinese, could expand their use in real-world clinical practice outside the English-speaking world. Another promising direction is exploring personalized RAG systems that adapt to individual preferences of patients and contexts. Ensuring RAG systems’ interpretability, transparency, privacy, and security is crucial in the medical domain.</p>
      </sec>
      <sec>
        <title>Strengths and Limitations</title>
        <p>This study has several strengths. First, novel RAG algorithms that effectively use local databases and external academic knowledge markedly improve the precision and real-time performance of responses to diabetes-related inquiries. Second, the RISE framework incorporates rigorous factual and safety checks for the generated outputs, ensuring reliable and secure responses.</p>
        <p>There are some limitations in the study. The RISE framework was developed and evaluated exclusively within the domain of diabetes. The generalizability of RISE to other medical domains remains uncertain. Future investigations should extend the application of the RISE framework to diverse medical specialties. Moreover, the scope of our evaluation was limited to the 43 predetermined queries. Future research should conduct clinical trials to assess the ability of RISE to effectively address inquiries of patients and enhance the efficiency of diabetes management in real-world clinical scenarios.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>In conclusion, the RISE framework shows promise as a safer and more reliable option for generating responses to common queries from diabetes patients. RISE significantly enhances the accuracy and comprehensiveness of original LLM responses by retrieval of external knowledge from reliable sources. This framework can potentially be a supplementary tool to improve patient understanding and disease outcomes.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Code for the Retrieval-augmented Information System for Enhancement framework.</p>
        <media xlink:href="jmir_v26i1e58041_app1.zip" xlink:title="ZIP File  (Zip Archive), 18 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Sample of the Retrieval-augmented Information System for Enhancement Diabetes question and answer data set (n=50).</p>
        <media xlink:href="jmir_v26i1e58041_app2.xlsx" xlink:title="XLSX File  (Microsoft Excel File), 23 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Responses for each model and raw scores for evaluation (clean).</p>
        <media xlink:href="jmir_v26i1e58041_app3.pdf" xlink:title="PDF File  (Adobe PDF File), 1979 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">FAISS</term>
          <def>
            <p>Facebook AI Similarity Search</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">LLMs</term>
          <def>
            <p>large language models</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">NIH</term>
          <def>
            <p>National Institutes of Health</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">RAG</term>
          <def>
            <p>Retrieval-augmented Generation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">RISE</term>
          <def>
            <p>Retrieval-augmented Information System for Enhancement</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The National Key Research and Development Program of China (2022YFC2502800); The National Natural Science Foundation of China (NSFC grant 82171034, 81721003); The High-Level Hospital Construction Project at Zhongshan Ophthalmic Center of Sun Yat-sen University (Grants 303010303058, 303020107, 303020108). The Young Talent Support Project (2022) of the Guangzhou Association for Science and Technology.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The complete data sets generated and analyzed during this study are available from the corresponding author on reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>Dingqiao Wang, Jiangbo Liang, and Jinguo Ye contributed equally as the first authors. Fei Li, Bo Qu, and Yingfeng Zheng contributed equally as co-corresponding authors. FL, BQ, and YZ contributed to conceptualization. DW, JL, and JY performed the methodology. The investigation was done by Jingni L, Jingpeng L, QZ, QH, CP, DW, ZL, WS, and DS. DW, JL, and JY wrote the original draft. FL, BQ, and YZ handled the writing-review and editing. FL, BQ, and YZ performed supervision. Final approval of the version to be published was done by all authors.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None Declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Buse</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Wexler</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Tsapas</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rossing</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Mingrone</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Mathieu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>D'Alessio</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Davies</surname>
              <given-names>MJ</given-names>
            </name>
          </person-group>
          <article-title>2019 Update to: management of hyperglycemia in type 2 diabetes, 2018. A consensus report by the American diabetes association (ADA) and the European association for the study of diabetes (EASD)</article-title>
          <source>Diabetes Care</source>
          <year>2020</year>
          <volume>43</volume>
          <issue>2</issue>
          <fpage>487</fpage>
          <lpage>493</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31857443"/>
          </comment>
          <pub-id pub-id-type="doi">10.2337/dci19-0066</pub-id>
          <pub-id pub-id-type="medline">31857443</pub-id>
          <pub-id pub-id-type="pii">dci19-0066</pub-id>
          <pub-id pub-id-type="pmcid">PMC6971782</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pastors</surname>
              <given-names>JG</given-names>
            </name>
            <name name-style="western">
              <surname>Warshaw</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Daly</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Franz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kulkarni</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>The evidence for the effectiveness of medical nutrition therapy in diabetes management</article-title>
          <source>Diabetes Care</source>
          <year>2002</year>
          <volume>25</volume>
          <issue>3</issue>
          <fpage>608</fpage>
          <lpage>613</lpage>
          <pub-id pub-id-type="doi">10.2337/diacare.25.3.608</pub-id>
          <pub-id pub-id-type="medline">11874956</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Nutrition policy and Healthy China 2030 building</article-title>
          <source>Eur J Clin Nutr</source>
          <year>2021</year>
          <volume>75</volume>
          <issue>2</issue>
          <fpage>238</fpage>
          <lpage>246</lpage>
          <pub-id pub-id-type="doi">10.1038/s41430-020-00765-6</pub-id>
          <pub-id pub-id-type="medline">33219269</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41430-020-00765-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Can ChatGPT help in the awareness of diabetes?</article-title>
          <source>Ann Biomed Eng</source>
          <year>2023</year>
          <volume>51</volume>
          <issue>10</issue>
          <fpage>2125</fpage>
          <lpage>2129</lpage>
          <pub-id pub-id-type="doi">10.1007/s10439-023-03356-1</pub-id>
          <pub-id pub-id-type="medline">37648882</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10439-023-03356-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Accelerating the integration of ChatGPT and other large‐scale AI models into biomedical research and healthcare</article-title>
          <source>MedComm – Future Medicine</source>
          <year>2023</year>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e43</fpage>
          <pub-id pub-id-type="doi">10.1002/mef2.43</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chlorogiannis</surname>
              <given-names>DD</given-names>
            </name>
            <name name-style="western">
              <surname>Apostolos</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chlorogiannis</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Palaiodimos</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Giannakoulas</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Pargaonkar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Xesfingi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kokkinidis</surname>
              <given-names>DG</given-names>
            </name>
          </person-group>
          <article-title>The role of ChatGPT in the advancement of diagnosis, management, and prognosis of cardiovascular and cerebrovascular disease</article-title>
          <source>Healthcare (Basel)</source>
          <year>2023</year>
          <volume>11</volume>
          <issue>21</issue>
          <fpage>2906</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=healthcare11212906"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/healthcare11212906</pub-id>
          <pub-id pub-id-type="medline">37958050</pub-id>
          <pub-id pub-id-type="pii">healthcare11212906</pub-id>
          <pub-id pub-id-type="pmcid">PMC10648908</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Lan</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>An AI dietitian for type 2 diabetes mellitus management based on large language and image recognition models: preclinical concept validation study</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <volume>25</volume>
          <fpage>e51300</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e51300/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/51300</pub-id>
          <pub-id pub-id-type="medline">37943581</pub-id>
          <pub-id pub-id-type="pii">v25i1e51300</pub-id>
          <pub-id pub-id-type="pmcid">PMC10667983</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lv</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ding</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Leveraging large language models for improved patient access and self-management: assessor-blinded comparison between expert- and AI-generated content</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <volume>26</volume>
          <fpage>e55847</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024/1/e55847/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/55847</pub-id>
          <pub-id pub-id-type="medline">38663010</pub-id>
          <pub-id pub-id-type="pii">v26i1e55847</pub-id>
          <pub-id pub-id-type="pmcid">PMC11082737</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Stokel-Walker</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Van Noorden</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>What ChatGPT and generative AI mean for science</article-title>
          <source>Nature</source>
          <year>2023</year>
          <volume>614</volume>
          <issue>7947</issue>
          <fpage>214</fpage>
          <lpage>216</lpage>
          <pub-id pub-id-type="doi">10.1038/d41586-023-00340-6</pub-id>
          <pub-id pub-id-type="medline">36747115</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-023-00340-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ge</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>JC</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence-based text generators in hepatology: ChatGPT is just the beginning</article-title>
          <source>Hepatol Commun</source>
          <year>2023</year>
          <volume>7</volume>
          <issue>4</issue>
          <fpage>e0097</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36972383"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/HC9.0000000000000097</pub-id>
          <pub-id pub-id-type="medline">36972383</pub-id>
          <pub-id pub-id-type="pii">02009842-202304010-00011</pub-id>
          <pub-id pub-id-type="pmcid">PMC10043591</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Májovský</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Černý</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kasal</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Komarc</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Netuka</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence can generate fraudulent but authentic-looking scientific medical articles: Pandora's box has been opened</article-title>
          <source>J Med Internet Res</source>
          <year>2023</year>
          <volume>25</volume>
          <fpage>e46924</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2023/1/e46924/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/46924</pub-id>
          <pub-id pub-id-type="medline">37256685</pub-id>
          <pub-id pub-id-type="pii">v25i1e46924</pub-id>
          <pub-id pub-id-type="pmcid">PMC10267787</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van Dis</surname>
              <given-names>EAM</given-names>
            </name>
            <name name-style="western">
              <surname>Bollen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zuidema</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>van Rooij</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bockting</surname>
              <given-names>CL</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: five priorities for research</article-title>
          <source>Nature</source>
          <year>2023</year>
          <volume>614</volume>
          <issue>7947</issue>
          <fpage>224</fpage>
          <lpage>226</lpage>
          <pub-id pub-id-type="doi">10.1038/d41586-023-00288-7</pub-id>
          <pub-id pub-id-type="medline">36737653</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-023-00288-7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Azamfirei</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kudchadkar</surname>
              <given-names>SR</given-names>
            </name>
            <name name-style="western">
              <surname>Fackler</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Large language models and the perils of their hallucinations</article-title>
          <source>Crit Care</source>
          <year>2023</year>
          <volume>27</volume>
          <issue>1</issue>
          <fpage>120</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ccforum.biomedcentral.com/articles/10.1186/s13054-023-04393-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13054-023-04393-x</pub-id>
          <pub-id pub-id-type="medline">36945051</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13054-023-04393-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC10032023</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lim</surname>
              <given-names>ZW</given-names>
            </name>
            <name name-style="western">
              <surname>Pushpanathan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Yew</surname>
              <given-names>SME</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lam</surname>
              <given-names>JSH</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>DZ</given-names>
            </name>
            <name name-style="western">
              <surname>Goh</surname>
              <given-names>JHL</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>MCJ</given-names>
            </name>
            <name name-style="western">
              <surname>Sheng</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Koh</surname>
              <given-names>VTC</given-names>
            </name>
            <name name-style="western">
              <surname>Tham</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Benchmarking large language models' performances for myopia care: a comparative analysis of ChatGPT-3.5, ChatGPT-4.0, and Google Bard</article-title>
          <source>EBioMedicine</source>
          <year>2023</year>
          <volume>95</volume>
          <fpage>104770</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2352-3964(23)00336-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ebiom.2023.104770</pub-id>
          <pub-id pub-id-type="medline">37625267</pub-id>
          <pub-id pub-id-type="pii">S2352-3964(23)00336-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC10470220</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Haddad</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Saade</surname>
              <given-names>JS</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on ophthalmology-related questions across various examination levels: observational study</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <volume>10</volume>
          <fpage>e50842</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e50842/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/50842</pub-id>
          <pub-id pub-id-type="medline">38236632</pub-id>
          <pub-id pub-id-type="pii">v10i1e50842</pub-id>
          <pub-id pub-id-type="pmcid">PMC10835593</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meo</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Khlaiwi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>AbuKhalaf</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Meo</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Klonoff</surname>
              <given-names>DC</given-names>
            </name>
          </person-group>
          <article-title>The scientific knowledge of Bard and ChatGPT in endocrinology, diabetes, and diabetes technology: multiple-choice questions examination-based performance</article-title>
          <source>J Diabetes Sci Technol</source>
          <year>2023</year>
          <fpage>19322968231203987</fpage>
          <pub-id pub-id-type="doi">10.1177/19322968231203987</pub-id>
          <pub-id pub-id-type="medline">37798960</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goodman</surname>
              <given-names>Rachel S</given-names>
            </name>
            <name name-style="western">
              <surname>Patrinely</surname>
              <given-names>J Randall</given-names>
            </name>
            <name name-style="western">
              <surname>Stone</surname>
              <given-names>Cosby A</given-names>
            </name>
            <name name-style="western">
              <surname>Zimmerman</surname>
              <given-names>Eli</given-names>
            </name>
            <name name-style="western">
              <surname>Donald</surname>
              <given-names>Rebecca R</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>Sam S</given-names>
            </name>
            <name name-style="western">
              <surname>Berkowitz</surname>
              <given-names>Sean T</given-names>
            </name>
            <name name-style="western">
              <surname>Finn</surname>
              <given-names>Avni P</given-names>
            </name>
            <name name-style="western">
              <surname>Jahangir</surname>
              <given-names>Eiman</given-names>
            </name>
            <name name-style="western">
              <surname>Scoville</surname>
              <given-names>Elizabeth A</given-names>
            </name>
            <name name-style="western">
              <surname>Reese</surname>
              <given-names>Tyler S</given-names>
            </name>
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>Debra L</given-names>
            </name>
            <name name-style="western">
              <surname>Bastarache</surname>
              <given-names>Julie A</given-names>
            </name>
            <name name-style="western">
              <surname>van der Heijden</surname>
              <given-names>Yuri F</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>Jordan J</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>Fei</given-names>
            </name>
            <name name-style="western">
              <surname>Carter</surname>
              <given-names>Nicholas</given-names>
            </name>
            <name name-style="western">
              <surname>Alexander</surname>
              <given-names>Matthew R</given-names>
            </name>
            <name name-style="western">
              <surname>Choe</surname>
              <given-names>Jennifer H</given-names>
            </name>
            <name name-style="western">
              <surname>Chastain</surname>
              <given-names>Cody A</given-names>
            </name>
            <name name-style="western">
              <surname>Zic</surname>
              <given-names>John A</given-names>
            </name>
            <name name-style="western">
              <surname>Horst</surname>
              <given-names>Sara N</given-names>
            </name>
            <name name-style="western">
              <surname>Turker</surname>
              <given-names>Isik</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>Rajiv</given-names>
            </name>
            <name name-style="western">
              <surname>Osmundson</surname>
              <given-names>Evan</given-names>
            </name>
            <name name-style="western">
              <surname>Idrees</surname>
              <given-names>Kamran</given-names>
            </name>
            <name name-style="western">
              <surname>Kiernan</surname>
              <given-names>Colleen M</given-names>
            </name>
            <name name-style="western">
              <surname>Padmanabhan</surname>
              <given-names>Chandrasekhar</given-names>
            </name>
            <name name-style="western">
              <surname>Bailey</surname>
              <given-names>Christina E</given-names>
            </name>
            <name name-style="western">
              <surname>Schlegel</surname>
              <given-names>Cameron E</given-names>
            </name>
            <name name-style="western">
              <surname>Chambless</surname>
              <given-names>Lola B</given-names>
            </name>
            <name name-style="western">
              <surname>Gibson</surname>
              <given-names>Michael K</given-names>
            </name>
            <name name-style="western">
              <surname>Osterman</surname>
              <given-names>Travis J</given-names>
            </name>
            <name name-style="western">
              <surname>Wheless</surname>
              <given-names>Lee E</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>Douglas B</given-names>
            </name>
          </person-group>
          <article-title>Accuracy and reliability of chatbot responses to physician questions</article-title>
          <source>JAMA Netw Open</source>
          <year>2023</year>
          <month>10</month>
          <day>02</day>
          <volume>6</volume>
          <issue>10</issue>
          <fpage>e2336483</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37782499"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.36483</pub-id>
          <pub-id pub-id-type="medline">37782499</pub-id>
          <pub-id pub-id-type="pii">2809975</pub-id>
          <pub-id pub-id-type="pmcid">PMC10546234</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hulman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dollerup</surname>
              <given-names>OL</given-names>
            </name>
            <name name-style="western">
              <surname>Mortensen</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Fenech</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Norman</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Støvring</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Hansen</surname>
              <given-names>TK</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT- versus human-generated answers to frequently asked questions about diabetes: a Turing test-inspired survey among employees of a Danish diabetes center</article-title>
          <source>PLoS One</source>
          <year>2023</year>
          <volume>18</volume>
          <issue>8</issue>
          <fpage>e0290773</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0290773"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0290773</pub-id>
          <pub-id pub-id-type="medline">37651381</pub-id>
          <pub-id pub-id-type="pii">PONE-D-23-12539</pub-id>
          <pub-id pub-id-type="pmcid">PMC10470899</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mann</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ryder</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Subbiah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kaplan</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Dhariwal</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Language models are few-shot learners</article-title>
          <source>Advances in Neural Information Processing Systems</source>
          <year>2020</year>
          <volume>33</volume>
          <fpage>1877</fpage>
          <lpage>1901</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <collab>OpenAI</collab>
          </person-group>
          <article-title>GPT-4 technical report</article-title>
          <source>arXiv:2303.08774</source>
          <year>2023</year>
          <volume>2</volume>
          <issue>5</issue>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chowdhery</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Narang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bosma</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mishra</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Roberts</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>PaLM: scaling language modeling with pathways</article-title>
          <source>Journal of Machine Learning Research</source>
          <year>2023</year>
          <volume>24</volume>
          <issue>240</issue>
          <fpage>1</fpage>
          <lpage>113</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Schuurmans</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bosma</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Xia</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title>
          <source>Advances in Neural Information Processing Systems</source>
          <year>2022</year>
          <volume>35</volume>
          <fpage>24824</fpage>
          <lpage>24837</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Suzgun</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Freitag</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Srivats</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Vosoughi</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Language models are multilingual chain-of-thought reasoners</article-title>
          <source>arXiv:2210.03057</source>
          <year>2022</year>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chalkidis</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT may pass the bar exam soon, but has a long way to go for the LexGLUE benchmark</article-title>
          <source>arXiv:2304.12202</source>
          <year>2023</year>
          <pub-id pub-id-type="doi">10.2139/ssrn.4385460</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kasai</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kasai</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sakaguchi</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Yamada</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Evaluating GPT-4 and ChatGPT on Japanese medical licensing examinations</article-title>
          <source>arXiv:2303.18027</source>
          <year>2023</year>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>West</surname>
              <given-names>CG</given-names>
            </name>
          </person-group>
          <article-title>AI and the FCI: can ChatGPT project an understanding of introductory physics?</article-title>
          <source>arXiv:2303.01067</source>
          <year>2023</year>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Dan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>ChatDoctor: a medical chat model fine-tuned on a large language model meta-AI (LLaMA) using medical domain knowledge</article-title>
          <source>Cureus</source>
          <year>2023</year>
          <volume>15</volume>
          <issue>6</issue>
          <fpage>e40895</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37492832"/>
          </comment>
          <pub-id pub-id-type="doi">10.7759/cureus.40895</pub-id>
          <pub-id pub-id-type="medline">37492832</pub-id>
          <pub-id pub-id-type="pmcid">PMC10364849</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Han</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Adams</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Papaioannou</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Grundmann</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Oberhauser</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Löser</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>MedAlpaca—an open-source collection of medical conversational AI models and training data</article-title>
          <source>arXiv:2304.08247</source>
          <year>2023</year>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nori</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>King</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>McKinney</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Carignan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Horvitz</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Capabilities of GPT-4 on medical challenge problems</article-title>
          <source>arXiv:2303.13375</source>
          <year>2023</year>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ayers</surname>
              <given-names>JW</given-names>
            </name>
            <name name-style="western">
              <surname>Poliak</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dredze</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Leas</surname>
              <given-names>EC</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Kelley</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Faix</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Goodman</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Longhurst</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Hogarth</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>DM</given-names>
            </name>
          </person-group>
          <article-title>Comparing physician and artificial intelligence chatbot responses to patient questions posted to a public social media forum</article-title>
          <source>JAMA Intern Med</source>
          <year>2023</year>
          <volume>183</volume>
          <issue>6</issue>
          <fpage>589</fpage>
          <lpage>596</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37115527"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id>
          <pub-id pub-id-type="medline">37115527</pub-id>
          <pub-id pub-id-type="pii">2804309</pub-id>
          <pub-id pub-id-type="pmcid">PMC10148230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Golovneva</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Poff</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Corredor</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zettlemoyer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Fazel-Zarandi</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ROSCOE: a suite of metrics for scoring step-by-step reasoning</article-title>
          <source>arXiv:2212.07919</source>
          <year>2022</year>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Soong</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sridhar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Si</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wagner</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Sá</surname>
              <given-names>ACC</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>CY</given-names>
            </name>
          </person-group>
          <article-title>Improving accuracy of GPT-3/4 results on biomedical data using a retrieval-augmented language model</article-title>
          <source>arXiv:2305.17116</source>
          <year>2023</year>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Madaan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Tandon</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Hallinan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gao</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wiegreffe</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Self-refine: iterative refinement with self-feedback</article-title>
          <source>Advances in Neural Information Processing Systems</source>
          <year>2024</year>
          <volume>36</volume>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Perez</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Piktus</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Petroni</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Karpukhin</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Goyal</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Retrieval-augmented generation for knowledge-intensive NLP tasks</article-title>
          <source>Advances in Neural Information Processing Systems</source>
          <year>2020</year>
          <volume>33</volume>
          <fpage>9459</fpage>
          <lpage>9474</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guu</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tung</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Pasupat</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Retrieval augmented language model pre-training</article-title>
          <source>PMLR</source>
          <year>2020</year>
          <conf-name>International Conference on Machine Learning</conf-name>
          <conf-date>2020 July 13</conf-date>
          <conf-loc>Vienna, Austria</conf-loc>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ge</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Owens</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Galvez</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Gologorskaya</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pletcher</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Development of a liver disease-specific large language model chat interface using retrieval-augmented generation</article-title>
          <source>Hepatology</source>
          <year>2024</year>
          <pub-id pub-id-type="doi">10.1097/HEP.0000000000000834</pub-id>
          <pub-id pub-id-type="medline">38451962</pub-id>
          <pub-id pub-id-type="pii">01515467-990000000-00791</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Miao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Thongprayoon</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Suppadungsuk</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Garcia Valencia</surname>
              <given-names>OA</given-names>
            </name>
            <name name-style="western">
              <surname>Cheungpasitporn</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Integrating retrieval-augmented generation with large language models in nephrology: advancing practical applications</article-title>
          <source>Medicina (Kaunas)</source>
          <year>2024</year>
          <volume>60</volume>
          <issue>3</issue>
          <fpage>445</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=medicina60030445"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/medicina60030445</pub-id>
          <pub-id pub-id-type="medline">38541171</pub-id>
          <pub-id pub-id-type="pii">medicina60030445</pub-id>
          <pub-id pub-id-type="pmcid">PMC10972059</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zakka</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Shad</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chaurasia</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dalal</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Moor</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fong</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Phillips</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Alexander</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ashley</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hirsch</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Langlotz</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Melia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nelson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sallam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tullis</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Vogelsong</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Cunningham</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Hiesinger</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Almanac - retrieval-augmented language models for clinical medicine</article-title>
          <source>NEJM AI</source>
          <year>2024</year>
          <volume>1</volume>
          <issue>2</issue>
          <fpage>AIoa2300068</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38343631"/>
          </comment>
          <pub-id pub-id-type="doi">10.1056/aioa2300068</pub-id>
          <pub-id pub-id-type="medline">38343631</pub-id>
          <pub-id pub-id-type="pmcid">PMC10857783</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Unlu</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mailly</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Oates</surname>
              <given-names>MF</given-names>
            </name>
            <name name-style="western">
              <surname>Tucci</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Varugheese</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wagholikar</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Scirica</surname>
              <given-names>BM</given-names>
            </name>
            <name name-style="western">
              <surname>Blood</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Aronson</surname>
              <given-names>SJ</given-names>
            </name>
          </person-group>
          <article-title>Retrieval augmented generation enabled generative pre-trained transformer 4 (GPT-4) performance for clinical trial screening</article-title>
          <source>medRxiv</source>
          <year>2024</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38370719"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/2024.02.08.24302376</pub-id>
          <pub-id pub-id-type="medline">38370719</pub-id>
          <pub-id pub-id-type="pii">2024.02.08.24302376</pub-id>
          <pub-id pub-id-type="pmcid">PMC10871450</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Retrieval-augmented generation across heterogeneous knowledge</article-title>
          <year>2022</year>
          <conf-name>Proceedings of the 2022 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies: Student Research Workshop</conf-name>
          <conf-date>2022 July 10</conf-date>
          <conf-loc>Washington</conf-loc>
          <pub-id pub-id-type="doi">10.18653/v1/2022.naacl-srw.7</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Retrieval-generation synergy augmented large language models</article-title>
          <source>arXiv:2310.05149</source>
          <year>2023</year>
          <pub-id pub-id-type="doi">10.1109/icassp48485.2024.10448015</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>He</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Bhasuran</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Tian</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hanna</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Shavor</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Arguello</surname>
              <given-names>LG</given-names>
            </name>
            <name name-style="western">
              <surname>Murray</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>Quality of answers of generative large language models versus peer users for interpreting laboratory test results for lay patients: evaluation study</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <volume>26</volume>
          <fpage>e56655</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024/1/e56655/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/56655</pub-id>
          <pub-id pub-id-type="medline">38630520</pub-id>
          <pub-id pub-id-type="pii">v26i1e56655</pub-id>
          <pub-id pub-id-type="pmcid">PMC11063893</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhuang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Iwinski</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wattenbarger</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Retrieval-augmented large language models for adolescent idiopathic scoliosis patients in shared decision-making</article-title>
          <year>2023</year>
          <conf-name>Proceedings of the 14th ACM International Conference on Bioinformatics, Computational Biology, and Health Informatics</conf-name>
          <conf-date>2023 Oct 04</conf-date>
          <conf-loc>USA</conf-loc>
          <fpage>1</fpage>
          <lpage>10</lpage>
          <pub-id pub-id-type="doi">10.1145/3584371.3612956</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>NCBI</collab>
          </person-group>
          <source>PMC</source>
          <access-date>2023-12-05</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ncbi.nlm.nih.gov/pmc/">https://www.ncbi.nlm.nih.gov/pmc/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="web">
          <source>Diabetes</source>
          <access-date>2023-12-05</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.niddk.nih.gov/health-information/diabetes">https://www.niddk.nih.gov/health-information/diabetes</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Al-Lawati</surname>
              <given-names>JA</given-names>
            </name>
          </person-group>
          <article-title>Diabetes mellitus: a local and global public health emergency!</article-title>
          <source>Oman Med J</source>
          <year>2017</year>
          <volume>32</volume>
          <issue>3</issue>
          <fpage>177</fpage>
          <lpage>179</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/28584596"/>
          </comment>
          <pub-id pub-id-type="doi">10.5001/omj.2017.34</pub-id>
          <pub-id pub-id-type="medline">28584596</pub-id>
          <pub-id pub-id-type="pii">OMJ-D-17-00064</pub-id>
          <pub-id pub-id-type="pmcid">PMC5447787</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Jawaid</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Sajjad</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT - Reshaping medical education and clinical management</article-title>
          <source>Pak J Med Sci</source>
          <year>2023</year>
          <volume>39</volume>
          <issue>2</issue>
          <fpage>605</fpage>
          <lpage>607</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36950398"/>
          </comment>
          <pub-id pub-id-type="doi">10.12669/pjms.39.2.7653</pub-id>
          <pub-id pub-id-type="medline">36950398</pub-id>
          <pub-id pub-id-type="pii">PJMS-39-605</pub-id>
          <pub-id pub-id-type="pmcid">PMC10025693</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hernandez</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Vazquez Gonzalez</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Polianovskaia</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Amoro Sanchez</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Muyolema Arce</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Mustafa</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Vypritskaya</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Perez Gutierrez</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Bashir</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Eighaei Sedeh</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>The future of patient education: AI-driven guide for type 2 diabetes</article-title>
          <source>Cureus</source>
          <year>2023</year>
          <volume>15</volume>
          <issue>11</issue>
          <fpage>e48919</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38024047"/>
          </comment>
          <pub-id pub-id-type="doi">10.7759/cureus.48919</pub-id>
          <pub-id pub-id-type="medline">38024047</pub-id>
          <pub-id pub-id-type="pmcid">PMC10654048</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Benchmarking large language models in retrieval-augmented generation</article-title>
          <year>2024</year>
          <conf-name>Proceedings of the AAAI Conference on Artificial Intelligence</conf-name>
          <conf-date>2024 Mar 10</conf-date>
          <conf-loc>Beijing, China</conf-loc>
          <fpage>17754</fpage>
          <lpage>17762</lpage>
          <pub-id pub-id-type="doi">10.1609/aaai.v38i16.29728</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lozano</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fleming</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Chiang</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Clinfo.ai: an open-source retrieval-augmented large language model system for answering medical questions using scientific literature</article-title>
          <source>Pac Symp Biocomput</source>
          <year>2024</year>
          <volume>29</volume>
          <fpage>8</fpage>
          <lpage>23</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://psb.stanford.edu/psb-online/proceedings/psb24/abstracts/2024_p8.html"/>
          </comment>
          <pub-id pub-id-type="medline">38160266</pub-id>
          <pub-id pub-id-type="pii">9789811286421_0002</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khene</surname>
              <given-names>ZE</given-names>
            </name>
            <name name-style="western">
              <surname>Bigot</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Mathieu</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Rouprêt</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bensalah</surname>
              <given-names>K</given-names>
            </name>
            <collab>French Committee of Urologic Oncology</collab>
          </person-group>
          <article-title>Development of a personalized chat model based on the European association of urology oncology guidelines: harnessing the power of generative artificial intelligence in clinical practice</article-title>
          <source>Eur Urol Oncol</source>
          <year>2024</year>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>160</fpage>
          <lpage>162</lpage>
          <pub-id pub-id-type="doi">10.1016/j.euo.2023.06.009</pub-id>
          <pub-id pub-id-type="medline">37474402</pub-id>
          <pub-id pub-id-type="pii">S2588-9311(23)00139-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A survey on retrieval-augmented text generation for large language models</article-title>
          <source>arXiv:2404.10981</source>
          <year>2024</year>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Touvron</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lavril</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Izacard</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Martinet</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lachaux</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lacroix</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Llama: open and efficient foundation language models</article-title>
          <source>arXiv:2302.13971</source>
          <year>2023</year>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
