<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v27i1e79379</article-id>
      <article-id pub-id-type="pmid">41159599</article-id>
      <article-id pub-id-type="doi">10.2196/79379</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Evaluating Web Retrieval–Assisted Large Language Models With and Without Whitelisting for Evidence-Based Neurology: Comparative Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Coristine</surname>
            <given-names>Andrew</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Wu</surname>
            <given-names>Peng</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mittermaier</surname>
            <given-names>Mirja</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Zahedivash</surname>
            <given-names>Aydin</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Masanneck</surname>
            <given-names>Lars</given-names>
          </name>
          <degrees>MSc, MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Neurology</institution>
            <institution>Medical Faculty and University Hospital Düsseldorf, Heinrich Heine University Düsseldorf</institution>
            <addr-line>Moorenstr. 5</addr-line>
            <addr-line>Dusseldorf, 40225</addr-line>
            <country>Germany</country>
            <phone>49 021181 19532</phone>
            <email>lars.masanneck@med.uni-duesseldorf.de</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2496-1415</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Epping</surname>
            <given-names>Paula Zoe</given-names>
          </name>
          <degrees>BSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0002-6219-988X</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Meuth</surname>
            <given-names>Sven G</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2571-3501</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Pawlitzki</surname>
            <given-names>Marc</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3080-2277</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Neurology</institution>
        <institution>Medical Faculty and University Hospital Düsseldorf, Heinrich Heine University Düsseldorf</institution>
        <addr-line>Dusseldorf</addr-line>
        <country>Germany</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Lars Masanneck <email>lars.masanneck@med.uni-duesseldorf.de</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>29</day>
        <month>10</month>
        <year>2025</year>
      </pub-date>
      <volume>27</volume>
      <elocation-id>e79379</elocation-id>
      <history>
        <date date-type="received">
          <day>20</day>
          <month>6</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>2</day>
          <month>9</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>5</day>
          <month>9</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>25</day>
          <month>9</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Lars Masanneck, Paula Zoe Epping, Sven G Meuth, Marc Pawlitzki. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 29.10.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2025/1/e79379" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Large language models (LLMs) coupled with real-time web retrieval are reshaping how clinicians and patients locate medical evidence, and as major search providers fuse LLMs into their interfaces, this hybrid approach might become the new “gateway” to the internet. However, open-web retrieval exposes models to nonprofessional sources, risking hallucinations and factual errors that might jeopardize evidence-based care.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We aimed to quantify the impact of guideline-domain whitelisting on the answer quality of 3 publicly available Perplexity web-based retrieval-augmented generation (RAG) models and compare their performance using a purpose-built, biomedical literature RAG system (OpenEvidence).</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We applied a validated 130-item question set derived from the American Academy of Neurology (AAN) guidelines (65 factual and 65 case-based). Perplexity Sonar, Sonar-Pro, and Sonar-Reasoning-Pro were each queried 4 times per question with open-web retrieval and again with retrieval restricted to aan.com and neurology.org (“whitelisted”). OpenEvidence was queried 4 times. Two neurologists, blinded to condition, scored each response (0=wrong, 1=inaccurate, and 2=correct); any disagreements that arose were resolved by a third neurologist. Ordinal logistic models were used to assess the influence of question type and source category (AAN or neurology vs nonprofessional) on accuracy.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>From the 3640 LLM answers that were rated (interrater agreement: κ=0.86), correct-answer rates were as follows (open vs whitelisted, respectively): Sonar, 60% vs 78%; Sonar-Pro, 80% vs 88%; and Sonar-Reasoning-Pro, 81% vs 89%; for OpenEvidence, the correct-answer rate was 82%. A Friedman test on modal scores across the 7 configurations was significant (<italic>χ</italic><sup>2</sup><sub>6</sub>=73.7; <italic>P</italic>&#60;.001). Whitelisting improved mean accuracy on the 0 to 2 scale by 0.23 for Sonar (95% CI 0.12-0.34), 0.08 for Sonar-Pro (95% CI 0.01-0.16), and 0.08 for Sonar-Reasoning-Pro (95% CI 0.02-0.13). Including ≥1 nonprofessional source halved the odds of a higher rating in Sonar (odds ratio [OR] 0.50, 95% CI 0.37-0.66; <italic>P</italic>&#60;.001), whereas citing an AAN or neurology document doubled it (OR 2.18, 95% CI 1.64-2.89; <italic>P</italic>&#60;.001). Furthermore, factual questions outperformed case vignettes across Perplexity models (ORs ranged from 1.95, 95% CI 1.28-2.98 [Sonar + whitelisting] to 4.28, 95% CI 2.59-7.09 [Sonar-Reasoning-Pro]; all <italic>P</italic>&#60;.01) but not for OpenEvidence (OR 1.44, 95% CI 0.92-2.27; <italic>P</italic>=.11).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Restricting retrieval to authoritative neurology domains yielded a clinically meaningful 8 to 18 percentage-point gain in correctness and halved output variability, upgrading a consumer search assistant to a decision-support-level tool that at least performed on par with a specialized literature engine. Lightweight source control is therefore a pragmatic safety lever for maintaining continuously updated, web-based RAG-augmented LLMs fit for evidence-based neurology.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>neurology</kwd>
        <kwd>large language models</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>evidence-based medicine</kwd>
        <kwd>medical guidelines</kwd>
        <kwd>information retrieval</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Large language models (LLMs) are rapidly being explored in health care for abilities ranging from extraction, labeling, and interpretation of clinical data [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref4">4</xref>] to support for clinical decisions [<xref ref-type="bibr" rid="ref5">5</xref>]. Despite these promising applications, LLMs often struggle with factual accuracy and reasoning and are known to “hallucinate” [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>], which implies that they can produce fluent, seemingly authoritative responses that are entirely fabricated or factually incorrect.</p>
      <p>Retrieval-augmented generation (RAG) has emerged as a promising strategy to mitigate hallucinations by constraining LLMs to use vetted reference material [<xref ref-type="bibr" rid="ref8">8</xref>]. Rather than relying only on LLM output, a RAG-based system retrieves relevant information or documents (eg, clinical guidelines and PubMed articles) from a knowledge store (eg, a database or a web search) and requires the model to respond based on that content. By grounding outputs in a high-quality evidence-based context and optionally reporting the sources, systems such as the Almanac by Zakka et al [<xref ref-type="bibr" rid="ref9">9</xref>] observed gains in factuality and safety over LLM or search products like ChatGPT or Bing when answering medical specialty questions. Large medical benchmarking efforts have further highlighted that RAG elevates smaller models to performance levels of larger, more resource-intensive LLMs [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. In our previous work, we have highlighted the potential of web search–based RAG setups, with a general-purpose system by the commercial AI provider Perplexity [<xref ref-type="bibr" rid="ref12">12</xref>] outperforming most other tested LLMs on a question set based on the guidelines for neurology according to the American Academy of Neurology (AAN) [<xref ref-type="bibr" rid="ref13">13</xref>]. In this setup, the web-RAG Perplexity model was also the only one that did not hallucinate sources [<xref ref-type="bibr" rid="ref13">13</xref>].</p>
      <p>Although previous studies have already indicated that LLM-driven answering can surpass traditional web search in certain aspects of medical information accuracy and relevance [<xref ref-type="bibr" rid="ref14">14</xref>], a combination of both could well shape access to medical information in the future. As dominant web search providers such as Google increasingly integrate LLMs into the search experience [<xref ref-type="bibr" rid="ref15">15</xref>], and major LLM chat services like ChatGPT [<xref ref-type="bibr" rid="ref16">16</xref>] and Claude [<xref ref-type="bibr" rid="ref17">17</xref>] simultaneously move toward web-supported generation, this paradigm of combining LLM generation with web retrieval might become the new norm. While the combination of both technologies promises technical innovation and efficiency boosts [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref18">18</xref>], such a shift could have profound implications for how the public—including medical professionals and patients—access and evaluate online information. Furthermore, while a web-based system has the advantage of having continuous access to new information, it might also be overwhelmed by nonprofessional sources, motivating certain professional LLM system providers, such as OpenEvidence [<xref ref-type="bibr" rid="ref19">19</xref>], to index only biomedical peer-reviewed literature.</p>
      <p>Given these developments, we asked a pragmatic set of questions pertinent to clinicians already experimenting with “AI-powered search.” How well do widely available web-RAG assistants perform out of the box? By how much do they improve when their retrieval is restricted to authoritative guideline domains, and how do they compare with a purpose-built, literature-only tool? To address these questions, we evaluated Perplexity’s 3 public tiers on the previously published guideline 130-item AAN benchmark [<xref ref-type="bibr" rid="ref13">13</xref>], first with the models’ default open-web retrieval and then with domain-level whitelisting to aan.com and neurology.org. To make a comparison, the same questions were posed to the professional medical OpenEvidence [<xref ref-type="bibr" rid="ref19">19</xref>] available to health care professionals. See <xref rid="figure1" ref-type="fig">Figure 1</xref> for a study outline.</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>Experimental setup and study overview. Overview of the study setup for benchmarking clinical answer quality across commercial and medical domain web search–based large language model (LLM) products. A set of 130 American Academy of Neurology (AAN)-based neurology questions was asked using 3 Perplexity models—Sonar, Sonar-Pro, and Sonar-Reasoning-Pro—as well as OpenEvidence, a medical RAG system. Each Perplexity model was tested with and without whitelisting based on the AAN and neurology (AAN-related journals) domains. Outputs were rated by blinded experts on a 3-point scale (correct, inaccurate, and wrong) and assessed for type of source. API: application programming interface; RAG: retrieval-augmented generation.</p>
        </caption>
        <graphic xlink:href="jmir_v27i1e79379_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Labeling Setup and Data Set</title>
        <p>We evaluated the accuracy of answers returned by Perplexity’s 3 commercially available model tiers—Sonar, Sonar-Pro, and Sonar-Reasoning-Pro—queried with and without Perplexity’s native whitelisting option. Sonar, at the time of querying, was based on Llama 3.3 70B [<xref ref-type="bibr" rid="ref20">20</xref>], with Sonar-Pro also based on the same model with additional capabilities (expanded context window of 200,000 tokens instead of 128,000, allowing it to process more input sources). Sonar-Reasoning-Pro was based on DeepSeek-R1 with weights publicly available [<xref ref-type="bibr" rid="ref21">21</xref>]. Perplexity was selected as the application programming interface (API) for this experiment because of its broad functionality (including whitelisting) and its early leadership and visibility in web-grounded, RAG-assisted LLM answering.</p>
        <p>To benchmark against a purpose-built medical system, we added OpenEvidence (no detailed technical information available), a RAG service that uses a proprietary index of biomedical literature and is available to health care professionals [<xref ref-type="bibr" rid="ref19">19</xref>].</p>
        <p>We used the previously published 130-item AAN benchmark with 65 clinical-case questions and 65 factual knowledge questions [<xref ref-type="bibr" rid="ref13">13</xref>]. The underlying items were written de novo from AAN guidance and partially paraphrased; half are clinical vignettes that require applying multiple recommendations to a scenario rather than matching page wording. As all questions are based on respective AAN guidelines, which are published on the AAN website and also often published in the neurology family of journals, whitelisting options were set to restrict the search space to websites based on the aan.com and neurology.org domains. All Perplexity runs were executed on March 18, 2025; each model or whitelisting combination was prompted 4 independent times to capture answer variance (identical system or user prompts). OpenEvidence was queried 4 times per item between March 11 and 15, 2025 (however, identical answers were returned on repeat calls potentially owing to server-side caching).</p>
      </sec>
      <sec>
        <title>Labeling Process</title>
        <p>Two physicians (LM and MP) with a background in neurology (clinical experience 5 years and 10 years, respectively) blindly scored every answer: 2=correct, 1=inaccurate, 0=wrong. A third senior neurologist (SGM; 21 years of experience) resolved any disagreements that arose. Raters were blinded to model and setup. As in a previous work [<xref ref-type="bibr" rid="ref13">13</xref>], raters were instructed to classify an answer as “correct” if the recommendation itself was accurate even if minor errors in the reasoning process were present. Responses were labeled “inaccurate” if they were partially incomplete or contained even minor errors that could lead to misunderstanding of an otherwise generally correct answer. Responses were labeled “wrong” when the question was not answered properly or contained incorrect, very incomplete, or potentially medically harmful or misleading information. Different examples of these rating categories are shown in the referenced work [<xref ref-type="bibr" rid="ref13">13</xref>] and in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>A separate investigator (PZE) labeled every cited reference as professional (peer-reviewed journals, official guidelines, or professional society web pages) or nonprofessional (news sites, blogs, or generic health portals). Citations to aan.com or neurology.org were flagged as AAN related. None of the models showed any meaningful “source hallucination” [<xref ref-type="bibr" rid="ref13">13</xref>]; therefore, a hallucination subanalysis was not conducted.</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>We addressed 2 prespecified questions. To ensure comparability with previous work [<xref ref-type="bibr" rid="ref13">13</xref>], we performed a comparison using modal answers for questions for each condition and performed a Friedman test; significant results were followed up by post hoc Wilcoxon signed-rank pairwise comparisons with Holm multiple testing adjustments. For a more nuanced analysis of differences between nonwhitelisting and whitelisting models, we treated the ordinal rating scale as a quasi-interval and summarized the mean paired differences (Δ); additionally, we reported Cliff δ as an ordinal, distribution-free effect size. Owing to the ordinal underlying scale, whitelisting conditions were compared using Wilcoxon signed-rank tests and Holm corrections, and 95% CIs of mean changes were estimated using bias-corrected bootstrapping (5000 resamples).</p>
        <p>We conducted two ordinal logistic regression analyses to test the influence of (1) the question type (knowledge vs case) and (2) retrieved source type on ordinal answer quality ratings. For each model category, we fitted an ordered-logit model. In the first analysis, we entered a single binary predictor containing the question type. In the second analysis, we instead entered 2 binary predictors—presence versus absence of an AAN or neurology journal citation and presence versus absence of a nonprofessional source (each with “absent” as the reference level). OpenEvidence was excluded from the second analysis as it never cites nonprofessional sources. We subjected the <italic>P</italic> values extracted from the Wald test again to Holm correction.</p>
        <p>All analysis codes and underlying data, including model answers and ratings, can be accessed at the following GitHub repository [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>No human subjects were involved in this research. As all analyses are based on a previously published data set without any actual patient data, in keeping with §15(1) of the Professional Code of Conduct (for physicians practicing in Germany) [<xref ref-type="bibr" rid="ref23">23</xref>], ethics committee consultation is not required for conducting this study.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Model Performance Across Configurations</title>
        <p>The analysis generated 3640 answers to 130 questions from 3 Perplexity models (Sonar, Sonar-Pro, and Sonar-Reasoning-Pro), each with and without whitelisting, 4 replicates per prompt, plus 4 iterations of OpenEvidence per question (when trying to prompt multiple times, it showed no variation, presumably because of question caching). Interrater agreement between primary raters of answer quality was good at κ=0.86.</p>
        <p>The basic model Sonar produced the lowest proportion of correct answers (314/520, 60.4%) and the highest rate of wrong answers (61/520, 11.7%). Answer quality improved with model capabilities to 79.8% (415/520) correct answers for Sonar-Pro and 81.2% (422/520) for Sonar-Reasoning-Pro. Whitelisting improved all these models to 78.1% (406/520; Sonar), 87.7% (456/520; Sonar-Pro), and 89% (463/520; Sonar-Reasoning-Pro). OpenEvidence, which as a product also works with indexed medical information, scored between these values, with 82.5% (429/520) of responses rated correct (<xref rid="figure2" ref-type="fig">Figure 2</xref>; Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for tabular representation). A Friedman test on modal ratings across all 7 model conditions was significant (<italic>χ</italic><sup>2</sup><sub>6</sub>=73.7; <italic>P</italic>&#60;.001), with post hoc Holm-adjusted Wilcoxon tests showing all models, including whitelisting Sonar, significantly outperforming nonwhitelisting Sonar (all adjusted <italic>P</italic>&#60;.01; see Table S3 and Figure S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for details). The top performer (Sonar-Reasoning-Pro + whitelisting) also exceeded Sonar + whitelisting (adjusted <italic>P</italic>&#60;.001).</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Overall response quality across models and configurations. Quality of generated answers under different system configurations, with and without the whitelisting (WL) of AAN domains (+WL). For OpenEvidence, the exact technical setup is not known, but some sort of whitelisting or indexing is assumed (assumed WL). Bars show the percentage of responses judged correct (light blue), inaccurate (orange), or wrong (red). AAN: American Academy of Neurology.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e79379_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Effects of Whitelisting on Answer Quality</title>
        <p>Adding whitelisting systematically improved each Perplexity tier. Mean paired differences (Δ) on the 0 to 2 scale were 0.23 for Sonar (95% CI 0.12-0.34; Cliff δ=0.22, adjusted <italic>P</italic>&#60;.001); 0.08 for Sonar-Pro (95% CI 0.01-0.16; Cliff δ=0.12, adjusted <italic>P</italic>=.02); and 0.08 for Sonar-Reasoning-Pro (95% CI 0.02-0.13; Cliff δ=0.20, adjusted <italic>P</italic>=.002; <xref rid="figure3" ref-type="fig">Figure 3</xref>). Of note, Cliff δ for all models also indicated a small effect.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Impact of whitelisting (WL) on question-level accuracy across Perplexity model setups. Panels A-C (Sonar), D-F (Sonar-Pro), and G-I (Sonar-Reasoning-Pro) each show, from left to right, paired means (A, D, and G): mean rating (0=wrong, 1=inaccurate, and 2=correct) for each question answered by the unaltered model (left) and the whitelisting-restricted version (right); gray lines connect the paired values. Mean differences (B, E, and H): black diamond with or without error bars mark the bias-corrected accelerated 95% bootstrap CI of the mean paired difference (Δ). The annotation box reports Δ (CI) &#124; Cliff δ (ordinal effect size) on the first line, and the 2-sided Wilcoxon signed-rank <italic>P</italic> value together with its Holm-adjusted value on the second line. Positive values favor WL. Ordinal distribution (C, F, and I): stacked horizontal bars of the proportion of answers rated wrong (0; red), inaccurate (1; orange), or correct (2; blue) for non-WL versus WL. AAN: American Academy of Neurology; Adj.: adjusted; diff: differences; dist.: distributions.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e79379_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Influence of Question Type and Source Quality</title>
        <p>When we analyzed the influence of the question type on answer quality, ordered-logit models showed higher accuracy on factual knowledge than case-based questions for every Perplexity configuration (odds ratios [ORs] ranged from 1.95, 95% CI 1.28-2.98 [Sonar + whitelisting] to 4.28, 95% CI 2.59-7.09 [Sonar-Reasoning-Pro nonwhitelisting]; all <italic>P</italic>&#60;.01), whereas OpenEvidence displayed no such imbalance (OR 1.44, 95% CI 0.92-2.27, <italic>P</italic>=.11; see Figure S5 and Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for detailed statistics).</p>
        <p>Similarly, the quality of the retrieved sources also had an influence on the response quality, especially for lower-tier models. For Sonar, retrieving ≥1 nonprofessional sources halved the odds of a higher rating (OR 0.50, adjusted <italic>P</italic>&#60;.001), whereas inclusion of an AAN or neurology citation more than doubled it (OR 2.18, adjusted <italic>P</italic>&#60;.001). The effect of source quality depreciated with model capabilities: only Sonar-Pro benefited significantly from guideline citations (OR 6.85, adjusted <italic>P</italic>=.01) while Sonar-Reasoning-Pro showed no significant source dependence (<xref rid="figure4" ref-type="fig">Figure 4</xref>; see Table S7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for tabular representation).</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Forest plot of stratified ordinal logistic regression for predictors of question-level accuracy across Perplexity model categories. Each model (Sonar, Sonar-Pro, and Sonar-Reasoning-Pro) is listed on the y-axis and hosts 2 predictors, offset vertically: AAN Neurology Citation (square marker, blue graphic) and nonprofessional source (circle marker, orange graphic). Markers denote the estimated odds ratios (ORs) on a logarithmic x-axis, with horizontal whiskers showing 95% CIs. Numeric OR values are printed above each point; a dashed vertical line at OR=1 indicates the null effect. Significance after Holm correction is marked by an asterisk beneath the corresponding marker. AAN: American Academy of Neurology.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e79379_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Rating variability across batches of the same question varied from almost a third of questions (39/120, 32.5%) for the worst-performing model (Sonar; nonwhitelisting) to under a fifth for better performing models + whitelisting (eg, Sonar-Pro + whitelisting: 16/120, 13.3%; Figure S8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
        <p>In relative terms, all but the weakest Perplexity configuration (Sonar; nonwhitelisting) surpassed the previously tested, supposedly larger online RAG Llama-3.1-405B model (67% correct) from the previous benchmark. The whitelisting setups of Sonar-Pro and Sonar-Reasoning-Pro performed even slightly better (1%-2%) than the previous best-performing model, which was a GPT-4o setup with direct access to only the relevant guidelines via RAG.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Overview</title>
        <p>In this comparative evaluation of commercially available online retrieval-augmented LLMs, answer accuracy scaled with both model capacity and—critically—domain-restricted retrieval. Activating Perplexity’s feature to restrict the search space to aan.com and neurology.org domains yielded absolute gains of 8 to 18 percentage points and halved the variance of stochastic outputs, most strikingly in the smaller Sonar tier. The best configuration achieved 89% guideline-consistent answers, outperforming a purpose-built, literature-only engine (OpenEvidence) and a previously tested specialized GPT-4o RAG system [<xref ref-type="bibr" rid="ref13">13</xref>].</p>
      </sec>
      <sec>
        <title>Whitelisting as a Low-Tech but High-Yield Intervention</title>
        <p>By activating whitelisting, we converted a general-purpose consumer tool into a near-domain-specific assistant without additional tuning, supporting recent evidence that retrieval quality—not just model scale—drives factual performance in medical RAG pipelines [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. By cutting nonprofessional URLs out of the source pool, whitelisting also removes a major route for hallucinations and unsafe content, which pose a persistent threat to the factuality of LLMs.</p>
        <p>Improvements were inversely proportional to baseline strength: the entry-level Sonar produced a wrong answer every eighth response, but whitelisting trimmed that error rate by one‑third. By contrast, Sonar‑Pro and Sonar‑Reasoning‑Pro were already strong; still, they gained approximately 8% in correct responses (although the “wrong” error rate was almost unaffected), underlining that source control remains worthwhile even for state‑of‑the‑art models. Ordinal-logit analysis confirmed the pattern: nonprofessional citations halved the odds of a correct answer in basic Sonar, whereas AAN or neurology journal links independently boosted accuracy in both Sonar and Sonar-Pro, with no significant effects in Sonar-Reasoning-Pro. While it is hard to extrapolate too much in this closed-model-setting, these findings suggest that fewer sources and less capable models benefit most from a cleaner evidence pool and from not having to judge a mixed-quality source. Across all tiers, factual knowledge items were easier than inferring case vignettes, a pattern consistent with earlier neurology [<xref ref-type="bibr" rid="ref13">13</xref>] and multispecialty studies [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], and one that has likewise been observed among medical students in an earlier study [<xref ref-type="bibr" rid="ref26">26</xref>]. This trend, interestingly, was not significant for the OpenEvidence comparison, which prompts further investigation into potential mediators and the underlying design of these products.</p>
      </sec>
      <sec>
        <title>Marginal Benefits of Explicit “Reasoning”</title>
        <p>The DeepSeek-R1–based reasoning model was not markedly superior to the nonreasoning Sonar-Pro; similarly minor, not clinically relevant improvements have also been observed in other traditional health care benchmarks such as the US medical licensing examination, where DeepSeek-R1, for example, performed roughly on par with a nonreasoning GPT-4o model tested in another study [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. Given that recently proposed open-ended and more granular benchmarks such as OpenAI’s HealthBench show larger differences between the two model types [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>], the small gap observed here may well indicate that many benchmarks—such as medical exams or the one used in this study—are too simple for effective benchmarking.</p>
      </sec>
      <sec>
        <title>What Are Acceptable Thresholds?</title>
        <p>Although no statistically significant differences were observed, Perplexity’s higher tiers under whitelisting performed at least on par with OpenEvidence, a purpose-built, literature-only engine. This indicates that carefully scoped web retrieval can achieve performance comparable to curated medical indexes. Whether an 89% correctness rate in the best-performing system is sufficient for clinical decision support remains context-dependent and debatable. European regulators, for example, increasingly view medical‐domain LLMs as medical devices, triggering AI Act transparency and postmarket surveillance requirements [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. The first European Conformité Européenne (CE)‑marked RAG-based LLM system (“Prof Valmed”) [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>] secured its certification through transparent and traceable evidence links, which is precisely the mechanism that whitelisted, web search LLMs leverage. Nevertheless, there is no certainty on how certified medical products will compete with rapidly evolving, less-regulated industry alternatives, given the regulatory constraints that inherently limit adaptability [<xref ref-type="bibr" rid="ref34">34</xref>]. Ultimately, decisions about system deployment are likely to hinge less on statistical significance and more on local clinical governance, effective postdeployment monitoring, clear accountability structures, and pragmatic user adoption. It also seems improbable that policymakers will significantly restrict medical web-based search applications powered by LLMs, considering they have not broadly intervened against general LLMs despite analogous concerns [<xref ref-type="bibr" rid="ref35">35</xref>].</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study was limited to the single clinical domain of neurology. We further assume that our fixed whitelisting worked because neurology has well-defined, high-quality guidelines on certain domains. While we expect the general principle to work across all medical disciplines, the effect size of source curation might differ depending on guideline quality, guideline availability, and its algorithmic retrieval frequency. This highlights another limitation: the opacity of the commercial LLM (and search) systems used, which might be updated silently after publication, thereby limiting reproducibility. One pragmatic way to mitigate this is to leverage open-weight analogs—open-source models with publicly available parameters—which would allow independent verification of results and scrutiny of model behavior [<xref ref-type="bibr" rid="ref36">36</xref>]. Such open models can be continuously audited and fine-tuned, helping secure consistent performance even as proprietary systems evolve. Using open-source libraries [<xref ref-type="bibr" rid="ref37">37</xref>], these can also be equipped with custom RAG setups, including grounding with search results. Importantly, alternative model setups might yield different absolute results, although we believe the observed trends would hold.</p>
        <p>Furthermore, because the benchmark used here is derived from AAN guidance and several configurations restricted retrieval to AAN-related domains, domain-source coupling may have contributed to the observed gains. However, items were partially paraphrased, and the results did not produce a ceiling effect. Moreover, a literature-only comparator (OpenEvidence, which likely contained AAN literature among many others) revealed a similar performance, suggesting that authoritative source control rather than site identity was the main driver. Nevertheless, this concept should be tested with other authoritative benchmark-whitelisting combinations, for example, using AAN-based questions but whitelisting European neurology guidelines.</p>
      </sec>
      <sec>
        <title>Future Work: Potential of Request-Tailored Whitelisting</title>
        <p>For application across specialties or varying question types, one might need a more flexible approach, which is why we considered a 2-stage pipeline for future work:</p>
        <list list-type="order">
          <list-item>
            <p>A model to select sources—a lightweight agent (possibly database-linked) that sets the whitelisting based on query intent (eg, medical field, query type) and other factors such as user role.</p>
          </list-item>
          <list-item>
            <p>LLM + RAG engine—operates only on the sources passed forward, dynamically delivering an evidence-backed answer (<xref rid="figure5" ref-type="fig">Figure 5</xref>).</p>
          </list-item>
        </list>
        <p>Conceptually, this moves toward similar approaches with agentic LLM setups [<xref ref-type="bibr" rid="ref38">38</xref>], which should be evaluated for trade-off between factuality, completeness, and recency in the future.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Conceptual domain-specific question-answer-pipeline balancing source control and recency. A proposed 2-stage architecture for future clinical question answering: incoming queries are first processed by a “Source Selector” module that dynamically retrieves, ranks, and whitelists evidence according to question type and medical specialty, which would ensure both domain relevance and up-to-date information. This could range from constituting a simple database to more complex models, potentially containing a large language model (LLM) + retrieval-augmented generation (RAG) system itself. The curated set of sources then feeds into the “whitelisting” part of an LLM + online RAG system to generate a structured, evidence-backed response.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e79379_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>Restricting retrieval improved correctness by 8 to 18 percentage points and reduced variability across all 3 web-RAG models, matching a dedicated professional literature-only engine. At the same time, the impact of source quality appears to be model-dependent. Taken together, retrieval control appears to be a low-friction safety lever for web-grounded LLMs with predefined authoritative sources. Further tests on generalizability and inclusion of more complex agentic systems are warranted.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Supplemental materials, including example answers, additional graphical representations, and more detailed tabular statistics.</p>
        <media xlink:href="jmir_v27i1e79379_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 632 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AAN</term>
          <def>
            <p>American Academy of Neurology</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">OR</term>
          <def>
            <p>odds ratio</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">RAG</term>
          <def>
            <p>retrieval-augmented generation</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>LM was supported by Deutsche Forschungsgemeinschaft (German Research Foundation; 493659010) and the B. Braun Foundation. Computing expenses were further generously supported by the society of friends and supporters of Düsseldorf’s university (Gesellschaft von Freunden und Förderern der HHU).</p>
      <p>SGM is funded by the German Ministry for Education and Research (BMBF), German Federal Institute for Risk Assessment, German Research Foundation, Else Kröner Fresenius Foundation, Gemeinsamer Bundesausschuss, German Academic Exchange Service, Hertie Foundation, Interdisciplinary Center for Clinical Studies Muenster, German Foundation Neurology, Ministry of Culture and Science of the State of North Rhine-Westphalia, The Daimler and Benz Foundation, Multiple Sclerosis Society North Rhine-Westphalia Regional Association, Peek &#38; Cloppenburg Düsseldorf Foundation, and the Hempel Foundation for Science, Art and Welfare.</p>
      <p>MP was supported by German Alzheimer Society e.V. Dementia self-help and the B. Braun Foundation.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data and the underlying code for this study can be found in the respective GitHub repository [<xref ref-type="bibr" rid="ref22">22</xref>].</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>LM, PZE, SGM, and MP designed the study. LM and PZE did the technical work to gather the data. LM and MP rated the answers with SGM as the tie-breaking vote. PZE classified the sources. LM performed the technical analyses. LM, PZE, MP, and SGM drafted and edited the manuscript. All authors have read and approved the manuscript. All authors had access to the underlying data.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Azar</surname>
              <given-names>WS</given-names>
            </name>
            <name name-style="western">
              <surname>Junkin</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Hesswani</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Koller</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>Parikh</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Schuppe</surname>
              <given-names>KC</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Nethala</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mendhiratta</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Kenigsberg</surname>
              <given-names>AP</given-names>
            </name>
            <name name-style="western">
              <surname>Turkbey</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Merino</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Zaki</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Cortner</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gurram</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pinto</surname>
              <given-names>PA</given-names>
            </name>
          </person-group>
          <article-title>LLM-mediated data extraction from patient records after radical prostatectomy</article-title>
          <source>NEJM AI</source>
          <year>2025</year>
          <month>05</month>
          <day>22</day>
          <volume>2</volume>
          <issue>6</issue>
          <fpage>AIcs2400943</fpage>
          <pub-id pub-id-type="doi">10.1056/AIcs2400943</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Masanneck</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Seifert</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kölsche</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Huntemann</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Jansen</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Mehsin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bernhard</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Meuth</surname>
              <given-names>SG</given-names>
            </name>
            <name name-style="western">
              <surname>Böhm</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Pawlitzki</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Triage performance across large language models, ChatGPT, and untrained doctors in emergency medicine: comparative study</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <month>06</month>
          <day>14</day>
          <volume>26</volume>
          <fpage>e53297</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024//e53297/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/53297</pub-id>
          <pub-id pub-id-type="medline">38875696</pub-id>
          <pub-id pub-id-type="pii">v26i1e53297</pub-id>
          <pub-id pub-id-type="pmcid">PMC11214027</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ferber</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wiest</surname>
              <given-names>IC</given-names>
            </name>
            <name name-style="western">
              <surname>Wölflein</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ebert</surname>
              <given-names>MP</given-names>
            </name>
            <name name-style="western">
              <surname>Beutel</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Eckardt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Truhn</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Springfeld</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jäger</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kather</surname>
              <given-names>JN</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 for information retrieval and comparison of medical oncology guidelines</article-title>
          <source>NEJM AI</source>
          <year>2024</year>
          <month>05</month>
          <day>23</day>
          <volume>1</volume>
          <issue>6</issue>
          <fpage>AIcs2300235</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1056/aics2300235"/>
          </comment>
          <pub-id pub-id-type="doi">10.1056/aics2300235</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kather</surname>
              <given-names>JN</given-names>
            </name>
            <name name-style="western">
              <surname>Ferber</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wiest</surname>
              <given-names>IC</given-names>
            </name>
            <name name-style="western">
              <surname>Gilbert</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Truhn</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Large language models could make natural language again the universal interface of healthcare</article-title>
          <source>Nat Med</source>
          <year>2024</year>
          <month>10</month>
          <day>23</day>
          <volume>30</volume>
          <issue>10</issue>
          <fpage>2708</fpage>
          <lpage>10</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-024-03199-w</pub-id>
          <pub-id pub-id-type="medline">39179856</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-024-03199-w</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gaber</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Shaik</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Allega</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Bilecz</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Busch</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Goon</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Franke</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Akalin</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Evaluating large language model workflows in clinical decision support for triage and referral and diagnosis</article-title>
          <source>NPJ Digit Med</source>
          <year>2025</year>
          <month>05</month>
          <day>09</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>263</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-025-01684-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-025-01684-1</pub-id>
          <pub-id pub-id-type="medline">40346344</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-025-01684-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC12064692</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhong</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>A survey on hallucination in large language models: principles, taxonomy, challenges, and open questions</article-title>
          <source>ACM Trans Inf Syst</source>
          <year>2025</year>
          <month>01</month>
          <day>24</day>
          <volume>43</volume>
          <issue>2</issue>
          <fpage>1</fpage>
          <lpage>55</lpage>
          <pub-id pub-id-type="doi">10.1145/3703155</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kankanhalli</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Hallucination is inevitable: an innate limitation of large language models</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on January 22, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2401.11817"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2401.11817</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Perez</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Piktus</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Petroni</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Karpukhin</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Goyal</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Küttler</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yih</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Rocktäschel</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Riedel</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kiela</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Retrieval-augmented generation for knowledge-intensive NLP tasks</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on May 22, 2020</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2005.11401"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2005.11401</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zakka</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Shad</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chaurasia</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dalal</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Moor</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fong</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Phillips</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Alexander</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ashley</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hirsch</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Langlotz</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Melia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nelson</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sallam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Tullis</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Vogelsong</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Cunningham</surname>
              <given-names>JP</given-names>
            </name>
            <name name-style="western">
              <surname>Hiesinger</surname>
              <given-names>W</given-names>
            </name>
          </person-group>
          <article-title>Almanac - retrieval-augmented language models for clinical medicine</article-title>
          <source>NEJM AI</source>
          <year>2024</year>
          <month>02</month>
          <volume>1</volume>
          <issue>2</issue>
          <fpage>1056</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38343631"/>
          </comment>
          <pub-id pub-id-type="doi">10.1056/aioa2300068</pub-id>
          <pub-id pub-id-type="medline">38343631</pub-id>
          <pub-id pub-id-type="pmcid">PMC10857783</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Benchmarking retrieval-augmented generation for medicine</article-title>
          <source>Findings of the Association for Computational Linguistics: ACL 2024</source>
          <year>2024</year>
          <conf-name>ACL '24</conf-name>
          <conf-date>August 11-16, 2024</conf-date>
          <conf-loc>Bangkok, Thailand</conf-loc>
          <fpage>6233</fpage>
          <lpage>51</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2024.findings-acl.372.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2024.findings-acl.372</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fernández-Pichel</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pichel</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Losada</surname>
              <given-names>DE</given-names>
            </name>
          </person-group>
          <article-title>Evaluating search engines and large language models for answering health questions</article-title>
          <source>NPJ Digit Med</source>
          <year>2025</year>
          <month>03</month>
          <day>10</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>153</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-025-01546-w"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-025-01546-w</pub-id>
          <pub-id pub-id-type="medline">40065094</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-025-01546-w</pub-id>
          <pub-id pub-id-type="pmcid">PMC11894092</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="web">
          <article-title>API platform</article-title>
          <source>Perplexity</source>
          <access-date>2024-12-24</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://docs.perplexity.ai/home">https://docs.perplexity.ai/home</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Masanneck</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Meuth</surname>
              <given-names>SG</given-names>
            </name>
            <name name-style="western">
              <surname>Pawlitzki</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Evaluating base and retrieval augmented LLMs with document or online support for evidence based neurology</article-title>
          <source>NPJ Digit Med</source>
          <year>2025</year>
          <month>03</month>
          <day>04</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>137</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-025-01536-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-025-01536-y</pub-id>
          <pub-id pub-id-type="medline">40038423</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-025-01536-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC11880332</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Varghese</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Alen</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Fujarski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Schlake</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Sucker</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Warnecke</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Sensor validation and diagnostic potential of smartwatches in movement disorders</article-title>
          <source>Sensors (Basel)</source>
          <year>2021</year>
          <month>04</month>
          <day>30</day>
          <volume>21</volume>
          <issue>9</issue>
          <fpage>3139</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=s21093139"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/s21093139</pub-id>
          <pub-id pub-id-type="medline">33946494</pub-id>
          <pub-id pub-id-type="pii">s21093139</pub-id>
          <pub-id pub-id-type="pmcid">PMC8124167</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
          <article-title>Generative AI in search: let Google do the searching for you</article-title>
          <source>Google</source>
          <access-date>2025-05-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://blog.google/products/search/generative-ai-google-search-may-2024/">https://blog.google/products/search/generative-ai-google-search-may-2024/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="web">
          <article-title>Introducing ChatGPT search</article-title>
          <source>OpenAI</source>
          <access-date>2025-05-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/index/introducing-chatgpt-search/">https://openai.com/index/introducing-chatgpt-search/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <article-title>Claude can now search the web</article-title>
          <source>Anthropic</source>
          <access-date>2025-05-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.anthropic.com/news/web-search">https://www.anthropic.com/news/web-search</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dewan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gautam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>LLM-driven usefulness judgment for web search evaluation</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on April 19, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2504.14401"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2504.14401</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <article-title>Home page</article-title>
          <source>OpenEvidence</source>
          <access-date>2025-05-22</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.openevidence.com">https://www.openevidence.com</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="web">
          <article-title>Meet new Sonar: a blazing fast model optimized for perplexity search</article-title>
          <source>Perplexity</source>
          <access-date>2025-05-22</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.perplexity.ai/hub/blog/meet-new-sonar">https://www.perplexity.ai/hub/blog/meet-new-sonar</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="web">
          <article-title>Today we're open-sourcing R1 1776, a version of the DeepSeek-R1 model that has been post-trained to provide unbiased, accurate, and factual information</article-title>
          <source>Perplexity</source>
          <access-date>2025-05-22</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.perplexity.ai/hub/blog/open-sourcing-r1-1776">https://www.perplexity.ai/hub/blog/open-sourcing-r1-1776</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="web">
          <source>MasanneckLab / Online-Search-Based-RAG</source>
          <access-date>2025-10-22</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/MasanneckLab/Online-Search-Based-RAG">https://github.com/MasanneckLab/Online-Search-Based-RAG</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <article-title>(Muster-)Berufsordnung für die in Deutschland tätigen Ärztinnen und Ärzte (MBO-Ä)</article-title>
          <source>Bundesärztekammer</source>
          <access-date>2025-05-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.bundesaerztekammer.de/fileadmin/user_upload/BAEK/Themen/Recht/_Bek_BAEK_Musterberufsordnung-AE.pdf">https://www.bundesaerztekammer.de/fileadmin/user_upload/BAEK/Themen/Recht/_Bek_BAEK_Musterberufsordnung-AE.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bicknell</surname>
              <given-names>BT</given-names>
            </name>
            <name name-style="western">
              <surname>Butler</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Whalen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ricks</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Dixon</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Spaedy</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Skelton</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Edupuganti</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Dzubinski</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tate</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Dyess</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lindeman</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lehmann</surname>
              <given-names>LS</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT-4 omni performance in USMLE disciplines and clinical skills: comparative analysis</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <month>11</month>
          <day>06</day>
          <volume>10</volume>
          <fpage>e63430</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e63430/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/63430</pub-id>
          <pub-id pub-id-type="medline">39504445</pub-id>
          <pub-id pub-id-type="pii">v10i1e63430</pub-id>
          <pub-id pub-id-type="pmcid">PMC11611793</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singhal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mahdavi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Scales</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tanwani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cole-Lewis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Pfohl</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Payne</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Seneviratne</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gamble</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Babiker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schärli</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chowdhery</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mansfield</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Demner-Fushman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Agüera Y Arcas</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Webster</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Matias</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gottweis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tomasev</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rajkomar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Barral</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Semturs</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Large language models encode clinical knowledge</article-title>
          <source>Nature</source>
          <year>2023</year>
          <month>08</month>
          <volume>620</volume>
          <issue>7972</issue>
          <fpage>172</fpage>
          <lpage>80</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37438534"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="medline">37438534</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC10396962</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Haycocks</surname>
              <given-names>NG</given-names>
            </name>
            <name name-style="western">
              <surname>Hernandez-Moreno</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bester</surname>
              <given-names>JC</given-names>
            </name>
            <name name-style="western">
              <surname>Hernandez</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kalili</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Samrao</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Simanton</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Vida</surname>
              <given-names>TA</given-names>
            </name>
          </person-group>
          <article-title>Assessing the difficulty and long-term retention of factual and conceptual knowledge through multiple-choice questions: a longitudinal study</article-title>
          <source>Adv Med Educ Pract</source>
          <year>2024</year>
          <volume>15</volume>
          <fpage>1217</fpage>
          <lpage>28</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.tandfonline.com/doi/10.2147/AMEP.S478193?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub++0pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.2147/AMEP.S478193</pub-id>
          <pub-id pub-id-type="medline">39697781</pub-id>
          <pub-id pub-id-type="pii">478193</pub-id>
          <pub-id pub-id-type="pmcid">PMC11653852</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tordjman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Yuce</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fauveau</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Mei</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Hadjadj</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bolger</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Almansour</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Horst</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Parihar</surname>
              <given-names>AS</given-names>
            </name>
            <name name-style="western">
              <surname>Geahchan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Meribout</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yatim</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Robson</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lewis</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Deyer</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Taouli</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Fayad</surname>
              <given-names>ZA</given-names>
            </name>
            <name name-style="western">
              <surname>Mei</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Comparative benchmarking of the DeepSeek large language model on medical tasks and clinical reasoning</article-title>
          <source>Nat Med</source>
          <year>2025</year>
          <month>08</month>
          <volume>31</volume>
          <issue>8</issue>
          <fpage>2550</fpage>
          <lpage>5</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-025-03726-3</pub-id>
          <pub-id pub-id-type="medline">40267969</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-025-03726-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arora</surname>
              <given-names>RK</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hicks</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bowman</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Quinonero-Candela</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tsimpourlas</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Sharman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Vallone</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Beutel</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Heidecke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Singhal</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>HealthBench: evaluating large language models towards improved human health</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on May 23, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2505.08775"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="web">
          <article-title>Introducing HealthBench</article-title>
          <source>OpenAI</source>
          <access-date>2025-05-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/index/healthbench/">https://openai.com/index/healthbench/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gilbert</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>The EU passes the AI Act and its implications for digital medicine are unclear</article-title>
          <source>NPJ Digit Med</source>
          <year>2024</year>
          <month>05</month>
          <day>22</day>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>135</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-024-01116-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-024-01116-6</pub-id>
          <pub-id pub-id-type="medline">38778162</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-024-01116-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC11111757</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Freyer</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Wiest</surname>
              <given-names>IC</given-names>
            </name>
            <name name-style="western">
              <surname>Gilbert</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Policing the boundary between responsible and irresponsible placing on the market of large language model health applications</article-title>
          <source>Mayo Clin Proc Digit Health</source>
          <year>2025</year>
          <month>03</month>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>100196</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2949-7612(25)00003-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.mcpdig.2025.100196</pub-id>
          <pub-id pub-id-type="medline">40206992</pub-id>
          <pub-id pub-id-type="pii">S2949-7612(25)00003-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC11976008</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <article-title>We provide validated information for healthcare professionals</article-title>
          <source>Prof. Valmed by Valmed Universe®</source>
          <access-date>2025-05-23</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://profvalmed.com/">https://profvalmed.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
          <article-title>Instruction for use</article-title>
          <source>Prof. Valmed by Valmed Universe®</source>
          <access-date>2025-05-27</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://profvalmed.com/instruction-for-use/">https://profvalmed.com/instruction-for-use/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Onitiu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wachter</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mittelstadt</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>How AI challenges the medical device regulation: patient safety, benefits, and intended uses</article-title>
          <source>J Law Biosci</source>
          <year>2024</year>
          <fpage>e007</fpage>
          <pub-id pub-id-type="doi">10.1093/jlb/lsae007</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gilbert</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Harvey</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Melvin</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Vollebregt</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Wicks</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Large language model AI chatbots require approval as medical devices</article-title>
          <source>Nat Med</source>
          <year>2023</year>
          <month>10</month>
          <day>30</day>
          <volume>29</volume>
          <issue>10</issue>
          <fpage>2396</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-023-02412-6</pub-id>
          <pub-id pub-id-type="medline">37391665</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-023-02412-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hussain</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Binz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mata</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Wulff</surname>
              <given-names>DU</given-names>
            </name>
          </person-group>
          <article-title>A tutorial on open-source large language models for behavioral science</article-title>
          <source>Behav Res Methods</source>
          <year>2024</year>
          <month>12</month>
          <day>15</day>
          <volume>56</volume>
          <issue>8</issue>
          <fpage>8214</fpage>
          <lpage>37</lpage>
          <pub-id pub-id-type="doi">10.3758/s13428-024-02455-8</pub-id>
          <pub-id pub-id-type="medline">39147947</pub-id>
          <pub-id pub-id-type="pii">10.3758/s13428-024-02455-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC11525391</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chase</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>LangChain</article-title>
          <source>GitHub</source>
          <access-date>2025-09-05</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/langchain-ai/langchain">https://github.com/langchain-ai/langchain</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Das</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Maheswari</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Siddiqui</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Arora</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Paul</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nanshi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Udbalkar</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Sarvade</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chaturvedi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Shvartsman</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Masih</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Thippeswamy</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Patil</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nirni</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Garsson</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Bandyopadhyay</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Maulik</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Farooq</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sengupta</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Improved precision oncology question-answering using agentic LLM</article-title>
          <source>medRxiv</source>
          <comment>Preprint posted online on September 24, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.medrxiv.org/content/10.1101/2024.09.20.24314076v1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/2024.09.20.24314076</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
