<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e88614</article-id><article-id pub-id-type="doi">10.2196/88614</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Benchmarking Large Language Models and Prompt Engineering Strategies in Microsatellite Instability Cancers: Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Zhang</surname><given-names>Yuxin</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Song</surname><given-names>Jie</given-names></name><degrees>BE</degrees><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bi</surname><given-names>Cheng</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Zheng</surname><given-names>Xin</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Xu</surname><given-names>Zhichuan</given-names></name><degrees>BE</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Cao</surname><given-names>Dan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Shen</surname><given-names>Bairong</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Department of Medical Oncology, Institutes for Systems Genetics, Frontiers Science Center for Disease-Related Molecular Network, West China Hospital, Sichuan University</institution><addr-line>No 2222 Xinchuan Road, Gaoxin District</addr-line><addr-line>Chengdu</addr-line><addr-line>Sichuan</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Lai</surname><given-names>Xiangxun</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lee</surname><given-names>Young</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Bairong Shen, PhD, Department of Medical Oncology, Institutes for Systems Genetics, Frontiers Science Center for Disease-Related Molecular Network, West China Hospital, Sichuan University, No 2222 Xinchuan Road, Gaoxin District, Chengdu, Sichuan, 610000, China, 86 15995854635, 86 28 61528682; <email>Bairong.shen@scu.edu.cn</email></corresp><fn fn-type="equal" 
id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>21</day><month>5</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e88614</elocation-id><history><date date-type="received"><day>28</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>10</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>10</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Yuxin Zhang, Jie Song, Cheng Bi, Xin Zheng, Zhichuan Xu, Dan Cao, Bairong Shen. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 21.5.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e88614"/><abstract><sec><title>Background</title><p>The reliability of general-purpose large language models (LLMs) for complex clinical tasks in specialized domains such as microsatellite instability (MSI) cancers remains critically uncharacterized. The absence of a domain-specific benchmark to evaluate and guide the optimization of their capabilities across diverse clinical tasks poses unevaluated risks to patient safety.</p></sec><sec><title>Objective</title><p>This study aimed to develop and validate Microsatellite Instability Cancer Benchmark (MSIC-Bench), a novel, two-tiered benchmark for MSI cancer, covering both consensus and frontier knowledge. Using this framework, we aimed to systematically assess LLM performance across various prompting strategies, identify task-specific weaknesses, and reveal effective pathways for performance improvement.</p></sec><sec sec-type="methods"><title>Methods</title><p>We developed MSIC-Bench, a 511-question benchmark derived from clinical guidelines and a curated knowledge base. Three state-of-the-art LLMs (GPT-4o [OpenAI], Gemini 2.5 Pro [Google], and Claude Opus 4 [Anthropic]) were evaluated using 4 prompting strategies, including vanilla, chain-of-thought, reflection of thoughts, and retrieval-augmented generation (RAG), under both multiple-choice and open-ended modalities. Performance was assessed on accuracy, safety (honesty), error composition, and token usage.</p></sec><sec sec-type="results"><title>Results</title><p>LLMs demonstrated a significant &#x201C;scaffolding effect,&#x201D; with accuracy dropping substantially in open-ended scenarios. 
For non-RAG strategies, the primary failure mode was an internal knowledge deficit. The integration of RAG proved to be the most effective intervention. A domain-aligned RAG strategy not only significantly improved accuracy in complex decision-making tasks but also fundamentally shifted the system&#x2019;s primary bottleneck from knowledge deficits to retrieval failures. In terms of safety, RAG induced a favorable shift from high-risk fabrications to safer refusals, though this introduced a safety-utility trade-off in the form of false refusals. Notably, our hybrid-RAG configuration, which combined both knowledge sources, demonstrated the most robust and generalizable performance across all tasks.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Current LLMs lack the specialized knowledge required for reliable application in MSI oncology. A well-designed RAG architecture is the pivotal intervention to address this gap. However, its success is not automatic; it transforms the nature of system failure, making retrieval precision and knowledge base quality the new critical determinants of performance and safety. Our findings establish a clear directive for developing trustworthy clinical artificial intelligence: focus must shift toward optimizing the retrieval component and curating high-quality, comprehensive knowledge sources. MSIC-Bench provides a robust framework to guide these future efforts.</p></sec></abstract><kwd-group><kwd>microsatellite instability</kwd><kwd>cancer</kwd><kwd>large language model</kwd><kwd>LLM</kwd><kwd>prompt engineering</kwd><kwd>benchmark</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Genomic instability is a core hallmark of cancer [<xref ref-type="bibr" rid="ref1">1</xref>] and uniquely bridges cancer and aging [<xref ref-type="bibr" rid="ref2">2</xref>], reflecting its central role in cell fate decisions. 
Microsatellite instability (MSI), a major form of genomic instability, serves as a crucial pan-cancer biomarker with significant diagnostic, prognostic, and therapeutic value [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. However, significant challenges remain in understanding the heterogeneity of MSI across cancer types, accurately detecting and classifying MSI in clinical settings, and optimizing personalized therapeutic strategies for MSI-positive patients [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Advances in artificial intelligence (AI), particularly deep learning, have transformed MSI detection [<xref ref-type="bibr" rid="ref7">7</xref>] and treatment response prediction [<xref ref-type="bibr" rid="ref8">8</xref>], with growing applications in the diagnosis and management of MSI-associated cancers [<xref ref-type="bibr" rid="ref9">9</xref>]. In recent years, large language models (LLMs) such as GPT-4 (OpenAI) [<xref ref-type="bibr" rid="ref10">10</xref>], Claude (Anthropic) [<xref ref-type="bibr" rid="ref11">11</xref>], and Gemini (Google) [<xref ref-type="bibr" rid="ref12">12</xref>], built on transformer architectures with billions of parameters, have driven innovation in health care through their exceptional abilities in understanding, reasoning, and generation. These LLMs have shown significant potential in various medical fields, including rare disease diagnosis [<xref ref-type="bibr" rid="ref13">13</xref>], sepsis management [<xref ref-type="bibr" rid="ref14">14</xref>], and medical education [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>However, the application of LLMs in MSI-related cancer care remains largely unexplored. 
In addition, the lack of standardized benchmarks for systematically evaluating their performance on MSI-specific tasks limits their clinical adoption and further development [<xref ref-type="bibr" rid="ref16">16</xref>].</p><p>To fill this critical research gap, we designed and constructed Microsatellite Instability Cancer Benchmark (MSIC-Bench), a novel, domain-specific evaluation framework. The architecture of MSIC-Bench is explicitly engineered to separate and assess 2 distinct cognitive abilities. It comprises two core tiers: (1) the basic tier, derived from established clinical guidelines, evaluates an LLM&#x2019;s mastery of foundational, consensus-based knowledge, and (2) the advanced tier, built upon our curated, evidence-based knowledge base, the Microsatellite Instability Cancer Knowledgebase (MSICKB) [<xref ref-type="bibr" rid="ref17">17</xref>], assesses an LLM&#x2019;s capacity to comprehend and reason with complex, frontier scientific evidence. Each tier is further categorized into key clinical subtasks, including molecular basis, diagnosis, treatment, and prognosis, ensuring a comprehensive assessment of the required knowledge spectrum.</p><p>Using this benchmark, we conducted a systematic evaluation of 3 state-of-the-art LLMs (GPT-4o [OpenAI], Gemini 2.5 Pro [Google], and Claude Opus 4 [Anthropic]) across 4 distinct prompting strategies. We assessed their performance under different prompting strategies designed to probe their internal knowledge and reasoning, including a vanilla zero-shot approach, a step-by-step reasoning approach (chain-of-thought [CoT] [<xref ref-type="bibr" rid="ref18">18</xref>]), and a multipersona reflective approach (reflection of thoughts [RoT] [<xref ref-type="bibr" rid="ref19">19</xref>]). 
Crucially, we also evaluated their capabilities when integrated with a retrieval-augmented generation (RAG) architecture [<xref ref-type="bibr" rid="ref20">20</xref>], providing them with access to our curated knowledge base.</p><p>Our investigation was structured along two critical dimensions: (1) the &#x201C;basic vs advanced&#x201D; dimension, to distinguish between mastery of consensus knowledge and comprehension of frontier science; and (2) the &#x201C;multitask&#x201D; dimension, to evaluate performance across key clinical subtasks (eg, diagnosis and treatment). This dual-dimensional analysis aimed to answer a central question: To what extent can LLMs provide accurate, reliable, and safe answers within the MSI cancer domain?</p><p>This study addresses these gaps through several integrated efforts. We first developed and validated MSIC-Bench, a new, expert-curated benchmark comprising 511 questions designed to assess LLMs on both foundational and frontier knowledge in MSI-related cancer. Using this benchmark, our multifaceted evaluation framework systematically assessed leading LLMs not just for accuracy, but also for AI safety&#x2014;distinguishing justified from false refusals&#x2014;and for error composition, pinpointing critical system bottlenecks.</p><p>Our evaluation yields several critical insights. We reveal that the primary bottleneck for standard LLMs is a profound deficit in specialized knowledge. While RAG is the critical intervention to address this, our analysis shows that it fundamentally shifts the bottleneck from knowledge to information retrieval, where &#x201C;retrieval failure&#x201D; becomes the new dominant error mode. Furthermore, we establish a safety-utility trade-off in RAG systems. We demonstrate that while RAG transforms high-risk &#x201C;fabrications&#x201D; into safer refusals, it can also introduce &#x201C;false refusals&#x201D;&#x2014;a new, utility-degrading error type. 
Finally, our findings provide actionable insights for developing more robust systems, showing that an RAG architecture that integrates both broad clinical guidelines and specialized knowledge offers a practical and effective solution. Together, these findings not only map the current capabilities and limitations of LLMs in oncology but also provide a clear, evidence-based roadmap for their future development and safe clinical integration.</p><p>To realize these contributions, we designed a comprehensive, multistage study. The overall workflow is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>, encompassing three core components: (1) benchmark construction, (2) benchmark content, and (3) experimental setup.</p><p>Therefore, the primary aims of this study were twofold: first, to develop and validate MSIC-Bench, a novel, domain-specific benchmark to fill a critical gap in evaluating LLMs for MSI cancer care; and second, to use this benchmark to systematically evaluate the capabilities and limitations of state-of-the-art LLMs and investigate the effectiveness of different RAG configurations in mitigating their identified weaknesses.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Schematic overview of the Microsatellite Instability Cancer Benchmark (MSIC-Bench) study design. 
AI: artificial intelligence; Q and A: question and answer.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e88614_fig01.png"/></fig></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Construction of MSIC-Bench</title><sec id="s2-1-1"><title>Overview</title><p>The construction of MSIC-Bench followed a rigorous, multistage process designed to ensure its clinical relevance, scientific accuracy, and comprehensive coverage.</p></sec><sec id="s2-1-2"><title>Data Sources</title><p>The MSIC-Bench was designed to separate the assessment of 2 critical LLM capabilities: generalization on foundational clinical knowledge and tracking of frontier scientific evidence. The basic tier was derived from a curated collection of the most recent and authoritative clinical practice guidelines from the National Comprehensive Cancer Network (NCCN) and the European Society for Medical Oncology (ESMO), as detailed in <xref ref-type="table" rid="table1">Table 1</xref>. 
These guidelines were selected to represent the stable, consensus-based knowledge required for standard clinical care.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Clinical practice guidelines used as data sources for the basic tier.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Institution and guideline name</td><td align="left" valign="bottom">Version</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="2">NCCN<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Colon cancer [<xref ref-type="bibr" rid="ref21">21</xref>]</td><td align="left" valign="top">Version 3.2025 (April 24, 2025)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Colorectal cancer screening [<xref ref-type="bibr" rid="ref22">22</xref>]</td><td align="left" valign="top">Version 1.2025 (May 30, 2025)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Gastric cancer [<xref ref-type="bibr" rid="ref23">23</xref>]</td><td align="left" valign="top">Version 2.2025 (April 4, 2025)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Rectal cancer [<xref ref-type="bibr" rid="ref24">24</xref>]</td><td align="left" valign="top">Version 2.2025 (March 31, 2025)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Uterine neoplasms [<xref ref-type="bibr" rid="ref25">25</xref>]</td><td align="left" valign="top">Version 3.2025 (March 7, 2025)</td></tr><tr><td align="left" valign="top" colspan="2">ESMO<sup><xref ref-type="table-fn" 
rid="table1fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Endometrial cancer [<xref ref-type="bibr" rid="ref26">26</xref>]</td><td align="left" valign="top">2022.05</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Gastric cancer [<xref ref-type="bibr" rid="ref27">27</xref>]</td><td align="left" valign="top">2022.07</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Localized colon cancer [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">2020.06</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Localized rectal cancer [<xref ref-type="bibr" rid="ref29">29</xref>]</td><td align="left" valign="top">2025.05</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Metastatic colorectal cancer [<xref ref-type="bibr" rid="ref30">30</xref>]</td><td align="left" valign="top">2022.10</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>NCCN: National Comprehensive Cancer Network.</p></fn><fn id="table1fn2"><p><sup>b</sup>ESMO: European Society for Medical Oncology.</p></fn></table-wrap-foot></table-wrap><p>Specifically, the advanced tier of MSIC-Bench was constructed using MSICKB, a comprehensive knowledge resource. MSICKB was developed to address the fragmentation and limited scope of data in MSI cancer research. By systematically curating 492 peer-reviewed publications, MSICKB provides a structured, multidimensional repository that goes beyond simple gene lists. 
It includes genetic and molecular features (such as mutations, expression levels, and epigenetic changes), clinicopathological characteristics (such as tumor location and histology), explicit prognostic data (including prognostic factors and survival outcomes), and therapeutic responses to various treatments. Unlike large-scale databases such as The Cancer Genome Atlas [<xref ref-type="bibr" rid="ref31">31</xref>], MSICKB offers granular, expert-curated annotations that capture the complexity and heterogeneity of MSI cancers. This comprehensive resource enables the creation of advanced, clinically nuanced questions essential for evaluating LLM performance.</p></sec><sec id="s2-1-3"><title>Expert Curation</title><p>The design and curation of MSIC-Bench were an iterative process led by 2 clinical experts, whose detailed qualifications are provided in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>For the basic tier, the 2 experts conducted a comprehensive review of the selected guidelines. They systematically identified core knowledge points across key clinical domains and collaboratively authored a series of multiple-choice and true or false questions to cover these fundamental aspects. Each question was repeatedly refined to ensure strict adherence to guideline content, avoid ambiguity, and maintain an appropriate level of difficulty.</p><p>For the advanced tier, the 2 experts used the MSICKB knowledge base and its corresponding source literature. They meticulously selected and extracted pivotal findings related to molecular features, phenotypic characteristics, therapeutic responses, and prognostic factors. 
Each question was then carefully crafted to reflect the latest advancements and unresolved challenges in the field, ensuring it represented the current research frontier.</p><p>Following the initial drafting, all 511 QA pairs underwent a final consensus validation by the 2 experts to guarantee their accuracy, clarity, and relevance before being finalized for the benchmark. The final basic tier consists of 142 multiple-choice and 108 true or false questions, while the advanced tier comprises 261 multiple-choice questions. Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> presents representative examples from the benchmark, illustrating the structure of questions across different tiers and task categories.</p><p>To enable a granular analysis of the capabilities of LLMs, all questions in the benchmark were categorized based on their clinical and scientific domain. The basic tier questions were classified into 5 key areas: molecular and genetic basis, diagnosis and testing, clinicopathological correlation, treatment, and prognosis. The advanced tier questions were classified into 4 primary tasks, further broken down into more specific subtasks: phenotypic features (eg, cancer characteristics and patient characteristics), molecular features (eg, gene and genome), therapeutic response (eg, immunotherapy, chemotherapy, and surgery), and prognostic factors (eg, genomic and molecular features). This multilayered categorization is a core design feature of MSIC-Bench, enabling a detailed performance evaluation that moves beyond a single overall accuracy score to reveal the specific strengths and weaknesses of LLMs across tasks. 
The exact number of questions per subtask is provided in Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>In addition to the primary benchmark of selection-based questions, a secondary set of open-ended questions was created, termed the without selections modality. This set was derived directly from the primary benchmark by removing the choice options. This dual-modality design serves a crucial purpose: the selection-based questions assess an LLM&#x2019;s discriminative ability to identify the correct answer from a predefined set of options, while the open-ended questions challenge its generative and recall capabilities to formulate an answer autonomously. This allowed for a direct comparison of LLM performance with and without the &#x201C;scaffolding&#x201D; of selection options, providing a more comprehensive evaluation of their reasoning abilities.</p></sec></sec><sec id="s2-2"><title>Experimental Setup</title><sec id="s2-2-1"><title>Overview</title><p>To evaluate LLM performance on MSIC-Bench, we designed a comprehensive experimental framework. We benchmarked 3 leading models across 4 distinct prompting strategies: vanilla, CoT, RoT, and RAG (basic-RAG, advanced-RAG, and hybrid-RAG).</p></sec><sec id="s2-2-2"><title>LLMs and Parameters</title><p>We evaluated 3 state-of-the-art LLMs: GPT-4o [<xref ref-type="bibr" rid="ref10">10</xref>], Claude Opus 4 [<xref ref-type="bibr" rid="ref11">11</xref>], and Gemini 2.5 Pro [<xref ref-type="bibr" rid="ref32">32</xref>]. For full transparency and reproducibility, detailed information on the specific model versions, application programming interface (API) identifiers, and knowledge cutoff dates is provided in Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
All API calls were executed using consistent parameter settings: temperature was set to 0.3 to ensure reproducibility while allowing for minor stylistic variation, and max_tokens was set to 4096 to prevent premature truncation. A retry mechanism with up to 5 attempts was implemented to handle transient API failures.</p></sec><sec id="s2-2-3"><title>Prompting Strategies</title><p>We benchmarked each model against 4 prompting strategies, designed to probe reasoning capabilities under different conditions of knowledge reliance (<xref ref-type="table" rid="table2">Table 2</xref>). The full, unabridged system prompts used for each strategy are provided in Section S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Prompt templates for the 4 evaluated strategies.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Prompting strategy</td><td align="left" valign="bottom">Template</td></tr></thead><tbody><tr><td align="left" valign="top">Vanilla</td><td align="left" valign="top">You are a biomedical expert. Answer the following question based on your internal knowledge. Question: {question}.</td></tr><tr><td align="left" valign="top">CoT<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">You are a biomedical expert. Answer the following question. First, provide your step-by-step reasoning process. Then, provide the final answer. Let&#x2019;s think step by step. Question: {question}, Reasoning: [Your step-by-step reasoning here], and Final Answer: [Your answer here].</td></tr><tr><td align="left" valign="top">RoT<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td><td align="left" valign="top">You are a biomedical expert. Answer the following question. Question: {question}. Imagine 3 medical experts are solving this task. 
Each expert independently provides their step-by-step reasoning and final answer. After all experts have finished, they discuss together, review, and backtrack their previous reasoning steps, and finally reach a consensus on the final answer. Please present: [Expert 1&#x2019;s reasoning and answer], [Expert 2&#x2019;s reasoning and answer], [Expert 3&#x2019;s reasoning and answer], and [The discussion and the agreed final answer].</td></tr><tr><td align="left" valign="top">RAG<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">You are a biomedical expert. Answer the following question based on the provided clinical guideline context and your internal knowledge. Question: {question}. Context: {context}.</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>CoT: chain-of-thought.</p></fn><fn id="table2fn2"><p><sup>b</sup>RoT: reflection of thoughts.</p></fn><fn id="table2fn3"><p><sup>c</sup>RAG: retrieval-augmented generation.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-2-4"><title>RAG Pipeline Implementation Details</title><p>To assess the impact of external knowledge, we implemented 3 distinct RAG configurations, each differing by its underlying knowledge source: basic-RAG (using established clinical guidelines), advanced-RAG (based on our curated MSICKB), and hybrid-RAG (combining both knowledge sources).</p><p>These configurations allowed us to systematically evaluate performance across different knowledge tiers. To ensure a controlled comparison and facilitate reproducibility, all critical pipeline parameters were standardized across these configurations. 
The specific details of each RAG system, including knowledge source, chunking strategy, embedding model, and retrieval methodology, are summarized in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Configuration parameters for retrieval-augmented generation (RAG) systems.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Parameter</td><td align="left" valign="bottom">Basic RAG<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup> (guideline-based)</td><td align="left" valign="bottom">Advanced RAG (MSICKB<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>-based)</td><td align="left" valign="bottom">Hybrid RAG (unified)</td></tr></thead><tbody><tr><td align="left" valign="top">Knowledge source</td><td align="left" valign="top">Unstructured clinical guideline PDFs</td><td align="left" valign="top">Structured MSICKB (JSON lines)</td><td align="left" valign="top">Combined guidelines and MSICKB</td></tr><tr><td align="left" valign="top">Document representation</td><td align="left" valign="top">Text chunks from parsed PDFs</td><td align="left" valign="top">Each JSON line as a distinct document</td><td align="left" valign="top">Hybrid of text chunks and JSON lines</td></tr><tr><td align="left" valign="top">Chunking strategy</td><td align="left" valign="top">RecursiveCharacterTextSplitter</td><td align="left" valign="top">N/A (Atomic document unit)</td><td align="left" valign="top">RecursiveCharacterTextSplitter (for guidelines)</td></tr><tr><td align="left" valign="top">Chunk size</td><td align="left" valign="top">1000 characters</td><td align="left" valign="top">N/A<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">1000 characters (for guidelines)</td></tr><tr><td align="left" valign="top">Chunk overlap</td><td align="left" valign="top">200 characters</td><td align="left" 
valign="top">N/A<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">200 characters (for guidelines)</td></tr><tr><td align="left" valign="top">Embedding model</td><td align="left" valign="top">text-embedding-3-small</td><td align="left" valign="top">text-embedding-3-small</td><td align="left" valign="top">text-embedding-3-small</td></tr><tr><td align="left" valign="top">Retrieval method</td><td align="left" valign="top">Cosine similarity</td><td align="left" valign="top">Cosine similarity</td><td align="left" valign="top">Cosine similarity</td></tr><tr><td align="left" valign="top">Top-k</td><td align="left" valign="top">3</td><td align="left" valign="top">3</td><td align="left" valign="top">3</td></tr><tr><td align="left" valign="top">Reranking</td><td align="left" valign="top">None</td><td align="left" valign="top">None</td><td align="left" valign="top">None</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>RAG: retrieval-augmented generation.</p></fn><fn id="table3fn2"><p><sup>b</sup>MSICKB: Microsatellite Instability Cancer Knowledgebase.</p></fn><fn id="table3fn3"><p><sup>c</sup>For the advanced RAG, each structured entry (JSON line) in the MSICKB was treated as an atomic, indivisible document for embedding and retrieval. Therefore, chunking parameters are not applicable.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s2-3"><title>Evaluation Protocol</title><sec id="s2-3-1"><title>Overview</title><p>Our evaluation was conducted across 4 key dimensions: accuracy, AI safety (honesty), error composition, and total token usage. To ensure a rigorous assessment, the dimensions of accuracy, AI safety (honesty), and error composition were manually evaluated by 2 experts. Both experts independently reviewed all responses, and any initial disagreements were resolved through a consensus discussion to establish the final labels. 
The statistical methods used to validate this interrater reliability are detailed in the &#x201C;Statistical Analysis&#x201D; subsection.</p></sec><sec id="s2-3-2"><title>Accuracy</title><p>Accuracy was assessed against a predefined right answer for each question. A response was marked as &#x201C;correct&#x201D; only if it was factually accurate and semantically equivalent to the ground truth answer. The final accuracy was calculated as the percentage of responses deemed &#x201C;correct.&#x201D;</p></sec><sec id="s2-3-3"><title>Honesty Analysis</title><p>Following the accuracy assessment, all incorrect responses were analyzed to evaluate the model&#x2019;s honesty. We classified these responses based on two primary behaviors applicable to all settings: (1) fabrication: the model provides a factually incorrect answer instead of admitting a knowledge gap. This represents a direct failure of honesty, and (2) justified refusal: the model correctly acknowledges its limitations by explicitly refusing to answer (eg, &#x201C;I don&#x2019;t know&#x201D;). This is considered a desirable safety feature.</p><p>For RAG strategies, the presence of external context introduced a third, more granular category to assess how the model used the provided evidence:</p><list list-type="bullet"><list-item><p>False refusal: this classification is unique to RAG and occurs when the model refuses to answer despite the retrieved context containing the necessary information. It represents a failure in utility, as the model did not correctly leverage the evidence provided.</p></list-item></list></sec><sec id="s2-3-4"><title>Error Composition Analysis</title><p>To diagnose specific failure modes, we designed a hierarchical, single-label annotation scheme for categorizing incorrect responses. For both non-RAG and RAG settings, we used a single-label error annotation scheme with a fixed priority order, which enables well-defined, interpretable comparisons of error distributions. 
In the non-RAG setting, evaluators consider question misinterpretation, knowledge deficit, and reasoning error in this order and assign the first label whose definition is satisfied. In the RAG setting, evaluators consider question misinterpretation, retrieval failure, context ignorance, and reasoning error in this order and similarly assign the first applicable label. The complete definitions, categories, and priority order for both schemes are detailed in Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-3-5"><title>Total Token Usage</title><p>To evaluate the efficiency across LLMs and prompting strategies, we measured the total token usage per interaction, defined as the sum of tokens in the input prompt and the generated output. The input prompt included system instructions, the question, and the retrieved context for RAG variants. The generated output was the model&#x2019;s completion. All texts were tokenized using the tiktoken (OpenAI) library [<xref ref-type="bibr" rid="ref33">33</xref>] with the cl100k_base encoding to provide a single, reproducible counting rule across models. This metric offers a more faithful estimate of resource consumption than response length alone.</p></sec></sec><sec id="s2-4"><title>Statistical Analysis</title><sec id="s2-4-1"><title>Interrater Reliability</title><p>To validate the reliability of the manual annotation process, we calculated the interrater reliability between the 2 domain experts for the dimensions of accuracy, honesty, and error composition. Cohen &#x03BA; coefficient was computed based on the experts&#x2019; initial independent ratings before their consensus discussion. 
The resulting &#x03BA; values were interpreted using the Landis and Koch [<xref ref-type="bibr" rid="ref34">34</xref>] benchmarks, where scores in the range of 0.61&#x2010;0.80 are considered &#x201C;substantial&#x201D; and 0.81&#x2010;1.00 are &#x201C;almost perfect&#x201D; agreement.</p></sec><sec id="s2-4-2"><title>Significance Testing for Performance Comparison</title><p>To statistically validate performance differences between prompting strategies, we used the exact binomial version of the McNemar test for paired nominal data. This test was used to compare the accuracy of 2 strategies on the same set of questions within specific subtasks. <italic>P</italic>&#x003C;.05 was considered statistically significant.</p></sec></sec><sec id="s2-5"><title>Ethical Considerations</title><p>This study was based exclusively on publicly available data, including published clinical guidelines and peer-reviewed papers. As the research did not involve human participants, it was exempt from Institutional Review Board review, and requirements for informed consent, participant privacy, and compensation were not applicable. All patient information referenced in the source literature was presumed to have been deidentified by the original authors in accordance with established ethical standards.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>Our evaluation protocol involved manual annotation by 2 domain experts for the key dimensions of accuracy, honesty, and error composition. To first establish the reliability of these subjective assessments, we calculated the interrater reliability before any performance analysis. 
We achieved almost perfect agreement for accuracy (Cohen &#x03BA;=0.903) and honesty (&#x03BA;=0.954), and substantial agreement for error composition (&#x03BA;=0.837), confirming the robustness of our evaluation methodology.</p></sec><sec id="s3-2"><title>Overall Accuracy: Selection Options Boost Performance While RAG Excels in Open-Ended Tasks</title><p>Our primary analysis focused on the overall accuracy of the 3 evaluated LLMs across the basic (<xref ref-type="fig" rid="figure2">Figure 2A</xref>) and advanced (<xref ref-type="fig" rid="figure2">Figure 2B</xref>) tiers. Within each tier, we compared LLM performance under 6 distinct prompting strategies in 2 evaluation modalities: multiple-choice (hatched bars) and open-ended (solid bars).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Overall accuracy of large language models (LLMs) on Microsatellite Instability Cancer Benchmark (MSIC-Bench).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e88614_fig02.png"/></fig><p>A primary finding is the significant impact of evaluation modality. This &#x201C;scaffolding effect,&#x201D; a term from cognitive science, is shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>. Across nearly all conditions, accuracy in the open-ended modality was substantially lower than in multiple-choice. This drop was most pronounced for non-RAG methods in the basic tier (<xref ref-type="fig" rid="figure2">Figure 2A</xref>), where Claude Opus 4 with the vanilla prompt declined from 93.6% to 71.6%. Predefined selection options transform a demanding generative recall task into a simpler discriminative recognition task; removing this scaffold exposes the LLM&#x2019;s limitations in open-ended scenarios.</p><p>This analysis also highlighted the clear superiority of the RAG strategies, particularly when the &#x201C;scaffolding&#x201D; was removed. 
In the multiple-choice modality on the basic tier (hatched bars in <xref ref-type="fig" rid="figure2">Figure 2A</xref>), basic RAG was already a top-performing strategy, achieving accuracies of up to 97.2%. However, the superiority of RAG became most pronounced in the more challenging open-ended modality. Across both tiers (solid bars in <xref ref-type="fig" rid="figure2">Figures 2A and B</xref>), all RAG-based methods&#x2014;basic, advanced, and hybrid RAG&#x2014;consistently and significantly outperformed their non-RAG counterparts. For example, in the advanced open-ended setting (<xref ref-type="fig" rid="figure2">Figure 2B</xref>), Claude Opus 4&#x2019;s accuracy increased substantially from 63.6% with RoT to 89.3% with advanced RAG and further to 89.7% with hybrid RAG, representing one of the most substantial performance gains observed in this setting. This underscores that for realistic, open-ended tasks, providing precisely aligned external knowledge is a highly effective strategy for improving accuracy.</p><p>However, our results revealed that the effectiveness of specialized RAG is highly dependent on the alignment between the retrieved context and the question&#x2019;s knowledge domain. This dependency was clearly illustrated by the models&#x2019; performance when presented with out-of-domain context. In the advanced multiple-choice setting (hatched bars in <xref ref-type="fig" rid="figure2">Figure 2B</xref>), a notable adverse effect was observed for the Claude Opus 4 model, whose accuracy declined significantly to 72.8% when using the out-of-domain basic RAG strategy. A converse pattern was observed in the basic multiple-choice setting (hatched bars in <xref ref-type="fig" rid="figure2">Figure 2A</xref>), where the introduction of out-of-domain advanced RAG context also led to a performance drop for some models, such as GPT-4o, declining to 63.2%.</p><p>The introduction of the hybrid RAG strategy directly addresses this domain alignment challenge. 
By having access to both knowledge bases, hybrid RAG simulates a more practical scenario. The results demonstrate its remarkable robustness. In the basic setting (<xref ref-type="fig" rid="figure2">Figure 2A</xref>), hybrid RAG&#x2019;s performance nearly matched that of the specialized basic RAG (eg, 95.2% vs 95.6% for GPT-4o in the multiple-choice modality). Similarly, in the advanced setting (<xref ref-type="fig" rid="figure2">Figure 2B</xref>), it remained highly competitive with the top-performing advanced RAG. This indicates that the hybrid RAG approach successfully navigates the knowledge selection challenge, maintaining high accuracy without requiring prior knowledge of the question&#x2019;s domain. It effectively mitigates the performance degradation seen with misaligned RAG, positioning it as a more generalizable and practical solution for real-world applications.</p></sec><sec id="s3-3"><title>Task-Specific Capability Profile: Uncovering True Failure Modes</title><p>To further dissect the LLM performance, we analyzed its accuracy across different clinical task categories (Figure S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The &#x201C;scaffolding effect&#x201D; of selection options was immediately apparent. A visual comparison between the multiple-choice modality (Figure S1A in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) and the open-ended modality (Figure S1B in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) reveals a universal degradation of performance across nearly all tasks and LLMs. 
The radar plots in the open-ended setting are visibly &#x201C;shrunken,&#x201D; indicating that the performance drop observed in the overall accuracy is a pervasive phenomenon affecting all facets of clinical tasks.</p><p>Our task-specific analysis provides a granular capability profile of current LLMs, offering actionable insights for both biomedical researchers and clinicians.</p><p>For researchers, LLMs demonstrated considerable reliability on fact-retrieval&#x2013;oriented tasks, such as those in the &#x201C;Molecular and Genetic Basis&#x201D; category (Figure S1A in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). This suggests their potential as powerful hypothesis generation assistants. For example, a researcher could rapidly query an LLM about the known molecular features of MSI tumors across different cancer types, accelerating literature review and experimental design.</p><p>For clinicians, the profile reveals a critical dichotomy. On one hand, LLMs can serve as reliable and rapid knowledge retrieval tools for well-established facts, such as those in &#x201C;Diagnosis and Testing.&#x201D; On the other hand, we pinpointed areas where clinicians must exercise extreme caution. A consistent pattern of underperformance was observed in complex, multifactor decision-making tasks, with &#x201C;Therapeutic Response (Chemotherapy)&#x201D; emerging as a prominent failure mode across all models in the challenging open-ended setting (Figure S1B in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). For instance, under the vanilla prompt, the accuracy of GPT-4o on this subtask (n=21) was only 33.3%, a performance level statistically indistinguishable from random chance in a high-stakes context. This striking weakness indicates that when faced with nuanced clinical scenarios that require weighing competing evidence, the internal knowledge of general-purpose LLMs is dangerously unreliable. 
This is the domain where unassisted LLM usage poses the greatest risk.</p><p>Crucially, our analysis also validates a clear pathway to mitigate these risks through a well-designed RAG system. However, the journey to effective RAG is nuanced. The erratic performance of basic RAG, which sourced knowledge only from general guidelines, highlights the danger of grounding models in incomplete or overly broad information; in some advanced tasks, it even underperformed non-RAG methods. In contrast, advanced RAG, which used our specialized MSICKB knowledge base, consistently improved performance across most high-risk tasks. The most robust results were achieved with hybrid RAG, which integrated both knowledge sources, leading to substantial and stable performance gains, particularly in the most challenging decision-making areas such as chemotherapy response. This improvement was statistically significant for hybrid RAG with both GPT-4o (<italic>P</italic>&#x003C;.001; McNemar test) and Gemini 2.5 Pro (<italic>P</italic>=.02) compared to the vanilla strategy, although not for Claude Opus 4 (<italic>P</italic>=.22), reflecting its unique interaction with RAG systems.</p><p>This finding provides a clear directive: for high-stakes clinical decision support, RAG is not a monolithic solution but a framework whose effectiveness is contingent upon the quality and comprehensiveness of its knowledge base. A well-curated, specialized knowledge source is an essential safety and performance mechanism.</p></sec><sec id="s3-4"><title>AI Safety Analysis: RAG as a Key Enabler for LLM Honesty</title><p>Moving from performance to safety, we analyzed the composition of incorrect responses by categorizing them into 3 distinct behaviors: fabrication, justified refusal, and false refusal (<xref ref-type="fig" rid="figure3">Figure 3</xref>). 
Our analysis begins with the basic tier (<xref ref-type="fig" rid="figure3">Figures 3A and 3B</xref>), where a stark pattern emerged: nearly all incorrect responses were classified as fabrication. This outcome, however, is not an indicator of inherent model dishonesty but a direct consequence of the benchmark&#x2019;s design. The selection-based and true or false questions in this tier did not include a &#x201C;don&#x2019;t know&#x201D; option, structurally forcing the LLM to guess when faced with uncertainty and precluding any form of refusal.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Honesty analysis of error behaviors.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e88614_fig03.png"/></fig><p>The models&#x2019; true behavioral tendencies are revealed in the advanced tier (<xref ref-type="fig" rid="figure3">Figures 3A and 3B</xref>), where expressing uncertainty is possible. Even here, non-RAG strategies (vanilla, CoT, and RoT) demonstrate a strong inclination toward fabrication, with rates often exceeding 85%. This highlights a critical safety concern, that is, without a bounded context, LLMs tend to guess rather than admit ignorance. In contrast, the introduction of RAG strategies fundamentally alters this behavior. Across all models and modalities, RAG dramatically reduces the proportion of fabrication by converting these high-risk errors into justified refusals. This shift represents a significant enhancement in model honesty, as the LLMs learn to correctly identify when the provided context is insufficient to form an answer.</p><p>However, this safety improvement comes at a price: the emergence of &#x201C;false refusals,&#x201D; a failure mode unique to RAG where the model refuses to answer despite the context containing the necessary information. 
This dynamic is most vividly illustrated by Claude Opus 4 under the basic RAG strategy in the multiple-choice setting (<xref ref-type="fig" rid="figure3">Figure 3A</xref>). Here, fabrication plummeted from 97.2% (vanilla) to just 35.2%. This reduction was primarily achieved by converting a remarkable 57.7% of errors into desirable justified refusals. The cost, however, was a 7% rate of false refusals. This pattern&#x2014;substituting dangerous fabrications with mostly safe justified refusals at the expense of some loss in utility&#x2014;is a consistent and critical finding of our RAG analysis.</p><p>This analysis demonstrates that RAG is a powerful mechanism for inducing epistemic caution. Grounding the model in a bounded context provides a clear basis for assessing knowledge limits, which is the mechanism behind justified refusals. While the existence of false refusals highlights a persistent challenge in ensuring models robustly use all provided evidence, the net effect represents a highly favorable shift in the error profile. 
RAG systematically replaces high-severity, factually incorrect errors (fabrication) with manageable, lower-severity utility errors (false refusals), thereby making the LLM&#x2019;s failure modes safer and more trustworthy.</p></sec><sec id="s3-5"><title>Error Analysis: Retrieval Failure as the New Bottleneck in RAG System</title><p>To understand the root causes of errors, we conducted a detailed error composition analysis, applying distinct taxonomies for non-RAG and RAG methods to precisely identify failure points (<xref ref-type="fig" rid="figure4">Figures 4A and 4B</xref>).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Error composition analysis of incorrect large language model (LLM) responses.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e88614_fig04.png"/></fig><p>For methods relying on the LLM&#x2019;s internal knowledge (vanilla, CoT, and RoT), a clear pattern emerged: failures were overwhelmingly caused by an internal knowledge deficit. This confirms that for this specialized domain, the inherent knowledge gaps of general-purpose LLMs are the primary barrier to accuracy.</p><p>The introduction of RAG altered the error distribution. The proportion of internal knowledge deficit errors was substantially reduced across all RAG configurations, as the external knowledge base compensated for the models&#x2019; internal deficiencies.</p><p>However, this shift revealed that retrieval failure replaced knowledge deficit as the new dominant source of error. In most RAG scenarios, retrieval failure constituted the single largest component of incorrect responses, often accounting for over half of all errors. 
This indicates that while RAG addresses the problem of &#x201C;not knowing,&#x201D; it introduces the challenge of &#x201C;not finding,&#x201D; making system performance highly dependent on the accuracy of the retrieval module.</p><p>Other RAG-specific errors, such as context ignorance (the LLMs failing to use correct context) and reasoning errors, occurred at a lower frequency. This suggests that when the retriever succeeds in providing the correct context, the LLMs are generally capable of using it appropriately. Therefore, a key area for improving RAG performance in this domain is to enhance the retriever&#x2019;s ability to consistently deliver the correct context.</p></sec><sec id="s3-6"><title>Token Usage Analysis: The High Price of Complex Reasoning and the Efficiency of RAG</title><p>Finally, we evaluated the token usage of each strategy by measuring the total token count (input and output) for each interaction (<xref ref-type="fig" rid="figure5">Figures 5</xref> and <xref ref-type="fig" rid="figure6">6</xref>).</p><p>The most notable finding, illustrated by the average token counts in <xref ref-type="fig" rid="figure5">Figure 5</xref>, was the substantial token usage associated with complex reasoning prompts. The RoT strategy consistently consumed the most tokens across all conditions. This effect was particularly pronounced for Gemini 2.5 Pro in the multiple-choice&#x2013;Basic setting (<xref ref-type="fig" rid="figure5">Figure 5A</xref>), where the average total tokens for RoT (1949 tokens) were nearly 2 times that of CoT (1044 tokens) and almost 4 times that of vanilla (560 tokens). This highlights the significant computational overhead required for multipersona ensemble reasoning.</p><p>The analysis also revealed a clear difference between the 2 evaluation modalities. 
As shown in both the average counts (<xref ref-type="fig" rid="figure5">Figure 5B</xref>) and the distributions (<xref ref-type="fig" rid="figure6">Figure 6B</xref>), interactions in the open-ended modality were generally more token-intensive. More importantly, the box plots in <xref ref-type="fig" rid="figure6">Figure 6B</xref> show that these open-ended interactions exhibited significantly greater variance (larger IQRs and more outliers) compared to their multiple-choice counterparts (<xref ref-type="fig" rid="figure6">Figure 6A</xref>). This indicates that open-ended questions not only require a higher average total token usage but also lead to less predictable and more variable costs.</p><p>In terms of efficiency, the family of RAG strategies demonstrated a superior balance of high accuracy and token economy. Crucially, our analysis accounts for the full cost, including retrieved context in the input tokens. While achieving high accuracy (as shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>), their average token counts (<xref ref-type="fig" rid="figure5">Figure 5</xref>) remained significantly lower than RoT and often comparable to CoT. This highlights a fundamental difference in cost structure: the expense of RoT is primarily in the generated output (ie, computational reasoning), whereas a significant portion of RAG&#x2019;s cost is front-loaded into the input prompt (ie, providing context). Furthermore, the RAG methods offer a spectrum of cost-performance options: hybrid RAG, which integrates CoT reasoning, logically incurs a higher token cost than basic or advanced RAG, positioning it as a premium option for when maximal accuracy is required. This positions the RAG framework as a strategically efficient approach for deploying reliable clinical AI systems. 
It allows developers to balance computational cost and performance requirements by selecting the appropriate RAG configuration&#x2014;from token-frugal methods for simpler tasks to more powerful, context-rich options such as hybrid RAG for high-stakes clinical decisions.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Average total token usage per interaction on the Microsatellite Instability Cancer Benchmark (MSIC-Bench).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e88614_fig05.png"/></fig><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Distribution of total token usage per interaction on the Microsatellite Instability Cancer Benchmark (MSIC-Bench).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e88614_fig06.png"/></fig></sec><sec id="s3-7"><title>Performance Analysis Relative to Knowledge Cutoff Date</title><p>To investigate the influence of training data currency on model performance, we conducted an analysis on the &#x201C;advanced&#x201D; tier using GPT-4o, comparing its performance on questions from precutoff vs postcutoff sources (<xref ref-type="fig" rid="figure7">Figure 7</xref>).</p><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>GPT-4o&#x2019;s (OpenAI) accuracy on questions from before- and aftercutoff sources.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e88614_fig07.png"/></fig><p>For strategies relying on internal knowledge (vanilla, CoT, and RoT), performance was consistently higher on the precutoff data (eg, 83.68% vs 76.32% for vanilla). This indicates a dependency on the model&#x2019;s training history for these non-RAG methods.</p><p>The analysis of RAG-based strategies revealed that performance is critically dependent on knowledge source alignment. 
A striking example is basic RAG, where providing out-of-domain guideline knowledge to answer advanced questions led to a sharp performance drop on postcutoff data (55.26%), the lowest of any strategy. This highlights the risk that a misaligned RAG can be more detrimental than no RAG at all.</p><p>In contrast, when the knowledge source was correctly aligned, RAG&#x2019;s effectiveness became evident. The advanced RAG strategy achieved the highest accuracy on postcutoff questions (97.37%), demonstrating the model&#x2019;s strong ability to reason over novel information when the correct context is supplied. Most notably, hybrid RAG maintained high and stable accuracy across both precutoff (91.32%) and postcutoff (89.47%) data, indicating its robustness.</p><p>These findings confirm that while non-RAG strategies are sensitive to training data currency, a RAG architecture&#x2019;s reliability is primarily determined by its knowledge base. A comprehensive and well-aligned knowledge source, as demonstrated by a hybrid RAG, allows the system to perform reliably, independent of the model&#x2019;s internal knowledge cutoff.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>The increasing application of LLMs in medicine raises a pivotal question regarding their reliability in knowledge-intensive subspecialties such as MSI cancers, a domain where no gold-standard evaluation benchmark existed at the time of our analysis. To address this challenge, we developed MSIC-Bench. This benchmark features a two-tiered structure based on clinical guidelines (eg, NCCN and ESMO) and a curated, evidence-based knowledge base (MSICKB), allowing for separate assessment of an LLM&#x2019;s performance on both consensus and nuanced scientific knowledge. 
Using this framework, we evaluated 3 leading LLMs across 6 prompting strategies, analyzing their accuracy, safety, error composition, and token usage.</p><p>Our findings reveal a significant &#x201C;scaffolding effect,&#x201D; where selection-based formats inflate accuracy compared to more realistic open-ended scenarios. This performance drop highlighted specific failure modes, notably in chemotherapy-related decisions. To address these failures, RAG proved to be a critical intervention. It not only substantially improves accuracy but also fundamentally shifts the error profile from high-risk &#x201C;fabrications&#x201D; to safer &#x201C;I don&#x2019;t know&#x201D; responses. Finally, our analysis highlights key complexities for real-world applications: model-specific vulnerabilities, the importance of knowledge-to-task alignment in RAG, and the high token usage of certain prompting strategies.</p><p>Our evaluation of MSIC-Bench yields 3 principal insights. First, we identified a pronounced &#x201C;scaffolding effect&#x201D;: LLMs perform substantially better on multiple-choice than on open-ended questions, indicating that widely used multiple-choice benchmarks likely overestimate their real-world clinical capabilities and that honest behavior cannot be assessed unless the task design explicitly allows refusals. Second, error composition analysis shows that in this specialized domain, the dominant failure mode of non-RAG strategies is internal knowledge deficit, meaning that better reasoning prompts alone cannot compensate for missing domain knowledge; with RAG, this bottleneck shifts from &#x201C;knowing&#x201D; to &#x201C;finding,&#x201D; as retrieval failure becomes the primary source of error. 
Third, RAG emerges as the key intervention for deploying LLMs in MSI oncology, with performance and safety tightly coupled to knowledge-base quality: domain-aligned and comprehensive resources (as in advanced and hybrid RAG) markedly improve accuracy and convert high-risk fabrications into safer refusals, but also introduce new dependencies on retrieval precision and computational cost, making optimization of the retrieval module and careful curation of knowledge sources (eg, through the use of structured methodologies such as ontologies [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>] and curated knowledge bases [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]) central to future clinical AI development.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>Our findings are consistent with and extend the growing body of literature on LLM evaluation in specialized domains. For instance, the performance gap we identified between multiple-choice and open-ended modalities (the &#x201C;scaffolding effect&#x201D;) aligns with similar discrepancies observed by Song et al [<xref ref-type="bibr" rid="ref13">13</xref>] in their study on RAG, confirming that evaluation formats significantly impact perceived model capabilities in clinical contexts.</p><p>Furthermore, our findings contribute to the continuing discussion about the roles of reasoning vs knowledge in LLM performance. Our conclusion regarding the limited efficacy of reasoning prompts such as CoT in knowledge-intensive tasks is corroborated by prior work. For instance, prior work [<xref ref-type="bibr" rid="ref39">39</xref>] similarly found that applying CoT in the knowledge-driven domain of rare disease diagnosis could paradoxically decrease accuracy. Our study provides a clear mechanistic explanation for this phenomenon. 
By using error composition analysis, we demonstrate that when foundational knowledge is absent&#x2014;as is often the case in specialized medicine&#x2014;the primary failure mode is internal knowledge deficit. In such scenarios, structured reasoning prompts cannot compensate for the core knowledge void.</p><p>Crucially, our work extends this understanding by showing how RAG transforms the problem. We demonstrate that by supplying external knowledge, RAG effectively eliminates internal knowledge deficit, but in doing so, it shifts the primary bottleneck to retrieval failure. This finding refines the distinction between reasoning-intensive and knowledge-intensive tasks, suggesting that for RAG-based systems, retrieval-intensity emerges as a new, critical dimension for analysis and optimization.</p></sec><sec id="s4-3"><title>Limitations and Future Directions</title><p>Our study has several limitations that also suggest directions for future work. First, MSIC-Bench is limited to a single clinical problem and reflects only a static snapshot of knowledge related to MSI-associated cancers, which restricts its breadth of coverage. Future work should broaden and update the benchmark within MSI cancers by adding more diverse questions, incorporating real-world case-based scenarios, and regularly revising content to reflect evolving evidence and model capabilities. Second, the benchmark is text-only, whereas real-world decision-making is multimodal. An important next step is the development of benchmarks that integrate images, genomic data, and electronic health records. Third, our question-answering format simplifies clinical reasoning, which is often iterative and conversational; future evaluations should include more realistic case-based and dialogue-style tasks.</p><p>Fourth, our study used a foundational RAG implementation. Our error analysis identified retrieval failure as the new primary bottleneck, which points to a clear and critical direction for future work. 
Research should prioritize the investigation of more sophisticated RAG techniques, such as iterative retrieval and query rewriting, to directly address this retrieval challenge. Fifth, our initial RAG evaluation used a &#x201C;split-index&#x201D; (oracle routing) strategy. While this design allowed us to isolate the impact of knowledge source quality, we acknowledge that it does not fully reflect real-world scenarios. Our introduction of the hybrid RAG strategy was a first step toward addressing this limitation, but future work should focus on developing more sophisticated dynamic routing mechanisms. Sixth, the same static prompt template was applied across all RAG settings. In the advanced and hybrid RAG strategies, the phrase &#x201C;clinical guideline context&#x201D; may not have fully matched the nature of the retrieved MSICKB-derived evidence, which could have influenced how the model interpreted the provided context. Future work should explore source-aware prompt designs that better align the instruction wording with the type of retrieved evidence. Seventh, our analysis revealed a critical trade-off between safety and utility in RAG systems. While RAG effectively curbed dangerous &#x201C;fabrications,&#x201D; it also introduced &#x201C;false refusals&#x201D;&#x2014;a failure to answer otherwise answerable questions that limits clinical utility. This highlights that the goal for future research is not simply to maximize refusals for safety, but to find an optimal balance. Future work should therefore focus on strategies to minimize false refusals without reintroducing a tendency to fabricate.</p><p>Eighth, our analysis of parametric memory was constrained by model cutoff dates, preventing a direct before-and-after comparison for Claude and Gemini.
However, our conclusion that &#x201C;seeing is not knowing&#x201D; is supported by strong indirect evidence: our error composition analysis (<xref ref-type="fig" rid="figure4">Figure 4</xref>) shows that internal knowledge deficit remained the dominant non-RAG failure mode for all models. This finding is consistent with multiple studies showing that LLMs often fail to reliably recall information, even when it is present in their training data, a phenomenon particularly pronounced in specialized, long-tail knowledge domains such as MSI cancers [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>]. Ninth, as the MSICKB knowledge base and the &#x201C;advanced&#x201D; tier questions were developed by the same research group, a discussion of potential circular validation is warranted. To mitigate this risk, every ground truth answer is strictly grounded in and explicitly linked to its source peer-reviewed publication via a PubMed ID. However, more robust external validation remains an important direction for our future work. Our primary planned next step is to collect and annotate real-world clinical data to create an independent test set for validating our benchmark and retrieval framework. We also note that this specific challenge of circular validation, where the retrieval corpus overlaps with the benchmark&#x2019;s source material, is a widely recognized issue in the evaluation of specialized RAG systems [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. Additionally, our token usage estimates relied on a single tokenizer (tiktoken), which only approximates tokenization behavior for non-OpenAI models. Future work should use model-specific tokenizers to obtain more accurate cost estimates. 
Finally, because LLMs evolve rapidly, our results capture only a single time point, emphasizing the need for continuous and longitudinal benchmarking in clinical AI.</p></sec><sec id="s4-4"><title>Conclusion</title><p>In this study, we addressed the critical absence of a standardized evaluation tool for LLMs in MSI cancers by developing and applying MSIC-Bench. Our benchmark revealed that the primary limitation of current LLMs is a profound deficit in domain-specific knowledge. Our results demonstrate that a well-designed RAG architecture is the pivotal intervention to address this knowledge gap. However, it is not a simple fix. We found that RAG transforms the system&#x2019;s failure modes, shifting the primary bottleneck from knowledge deficits to retrieval failures. This insight establishes a core principle for the development of trustworthy clinical AI: the quality and comprehensiveness of the knowledge base and the precision of the retrieval system are the most critical components. Ultimately, enhancing clinical RAG systems requires a dual focus on optimizing retrieval mechanisms and improving the model&#x2019;s contextual reasoning. MSIC-Bench provides not only a comprehensive capability profile of LLMs in this domain but also a replicable methodology to guide this future development, ensuring that knowledge-augmented AI assistants can be validated and safely integrated into specialized medical fields.</p></sec></sec></body><back><ack><p>We extend our sincere gratitude to the 2 clinical experts from a major academic medical center in China who provided invaluable contributions to this study. Their expertise was essential for the benchmark&#x2019;s development and the manual evaluation of the large language model (LLM) outputs. Detailed, anonymized qualifications for these experts are provided in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.
Generative artificial intelligence (Gemini 2.5 Pro; Google) was used to support linguistic refinement and coherence in the paper text. All conceptual contributions, interpretations, and substantive arguments presented in this paper are solely those of the authors.</p></ack><notes><sec><title>Funding</title><p>This work was supported by the National Natural Science Foundation of China (grant nos 32270690 and 32570773) and the Sichuan Province Science and Technology Program (grant no 2024YFHZ0205). The funders played no role in the study design; data collection, analysis, and interpretation; or writing of the paper.</p></sec><sec><title>Data Availability</title><p>The Microsatellite Instability Cancer Benchmark (MSIC-Bench) data, analysis scripts, and additional materials are openly available in a GitHub repository [<xref ref-type="bibr" rid="ref44">44</xref>]. Specifically, the exact versions of the National Comprehensive Cancer Network (NCCN) and European Society for Medical Oncology (ESMO) guidelines analyzed in this study are permanently archived and can be directly accessed in the Knowledge_Sources folder. The full outputs generated during this study are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>JS, YZ, and BS contributed to conceptualization. YZ and JS contributed to data curation. YZ, JS, and CB conducted the formal analysis. BS acquired funding. YZ, JS, and ZX conducted the investigation. YZ, JS, and CB developed the methodology. BS led project administration and supervision. JS, YZ, and ZX contributed to software development. YZ and JS performed validation. YZ, JS, and XZ contributed to visualization. YZ and JS prepared the original draft.
YZ, JS, BS, XZ, and DC reviewed and edited the paper.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">CoT</term><def><p>chain-of-thought</p></def></def-item><def-item><term id="abb4">ESMO</term><def><p>European Society for Medical Oncology</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">MSI</term><def><p>microsatellite instability</p></def></def-item><def-item><term id="abb7">MSIC-Bench</term><def><p>Microsatellite Instability Cancer Benchmark</p></def></def-item><def-item><term id="abb8">MSICKB</term><def><p>Microsatellite Instability Cancer Knowledgebase</p></def></def-item><def-item><term id="abb9">NCCN</term><def><p>National Comprehensive Cancer Network</p></def></def-item><def-item><term id="abb10">RAG</term><def><p>retrieval-augmented generation</p></def></def-item><def-item><term id="abb11">RoT</term><def><p>reflection of thoughts</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hanahan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Weinberg</surname><given-names>RA</given-names> </name></person-group><article-title>Hallmarks of cancer: the next generation</article-title><source>Cell</source><year>2011</year><month>03</month><day>4</day><volume>144</volume><issue>5</issue><fpage>646</fpage><lpage>674</lpage><pub-id pub-id-type="doi">10.1016/j.cell.2011.02.013</pub-id><pub-id pub-id-type="medline">21376230</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>L&#x00F3;pez-Ot&#x00ED;n</surname><given-names>C</given-names> </name><name name-style="western"><surname>Blasco</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Partridge</surname><given-names>L</given-names> </name><name name-style="western"><surname>Serrano</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kroemer</surname><given-names>G</given-names> </name></person-group><article-title>The hallmarks of aging</article-title><source>Cell</source><year>2013</year><month>06</month><day>6</day><volume>153</volume><issue>6</issue><fpage>1194</fpage><lpage>1217</lpage><pub-id pub-id-type="doi">10.1016/j.cell.2013.05.039</pub-id><pub-id pub-id-type="medline">23746838</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Marcus</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lemery</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Keegan</surname><given-names>P</given-names> </name><name name-style="western"><surname>Pazdur</surname><given-names>R</given-names> </name></person-group><article-title>FDA approval summary: pembrolizumab for the treatment of microsatellite instability-high solid tumors</article-title><source>Clin Cancer Res</source><year>2019</year><month>07</month><day>1</day><volume>25</volume><issue>13</issue><fpage>3753</fpage><lpage>3758</lpage><pub-id pub-id-type="doi">10.1158/1078-0432.CCR-18-4070</pub-id><pub-id pub-id-type="medline">30787022</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nojadeh</surname><given-names>JN</given-names> </name><name 
name-style="western"><surname>Behrouz Sharif</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sakhinia</surname><given-names>E</given-names> </name></person-group><article-title>Microsatellite instability in colorectal cancer</article-title><source>EXCLI J</source><year>2018</year><volume>17</volume><fpage>159</fpage><lpage>168</lpage><pub-id pub-id-type="doi">10.17179/excli2017-948</pub-id><pub-id pub-id-type="medline">29743854</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Luchini</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bibeau</surname><given-names>F</given-names> </name><name name-style="western"><surname>Ligtenberg</surname><given-names>MJL</given-names> </name><etal/></person-group><article-title>ESMO recommendations on microsatellite instability testing for immunotherapy in cancer, and its relationship with PD-1/PD-L1 expression and tumour mutational burden: a systematic review-based approach</article-title><source>Ann Oncol</source><year>2019</year><month>08</month><day>1</day><volume>30</volume><issue>8</issue><fpage>1232</fpage><lpage>1243</lpage><pub-id pub-id-type="doi">10.1093/annonc/mdz116</pub-id><pub-id pub-id-type="medline">31056702</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sahin</surname><given-names>IH</given-names> </name><name name-style="western"><surname>Akce</surname><given-names>M</given-names> </name><name name-style="western"><surname>Alese</surname><given-names>O</given-names> </name><etal/></person-group><article-title>Immune checkpoint inhibitors for the treatment of MSI-H/MMR-D colorectal cancer and a perspective on resistance mechanisms</article-title><source>Br J 
Cancer</source><year>2019</year><month>11</month><volume>121</volume><issue>10</issue><fpage>809</fpage><lpage>818</lpage><pub-id pub-id-type="doi">10.1038/s41416-019-0599-y</pub-id><pub-id pub-id-type="medline">31607751</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kather</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Pearson</surname><given-names>AT</given-names> </name><name name-style="western"><surname>Halama</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Deep learning can predict microsatellite instability directly from histology in gastrointestinal cancer</article-title><source>Nat Med</source><year>2019</year><month>07</month><volume>25</volume><issue>7</issue><fpage>1054</fpage><lpage>1056</lpage><pub-id pub-id-type="doi">10.1038/s41591-019-0462-y</pub-id><pub-id pub-id-type="medline">31160815</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name></person-group><article-title>Artificial intelligence for prediction of response to cancer immunotherapy</article-title><source>Semin Cancer Biol</source><year>2022</year><month>12</month><volume>87</volume><fpage>137</fpage><lpage>147</lpage><pub-id pub-id-type="doi">10.1016/j.semcancer.2022.11.008</pub-id><pub-id pub-id-type="medline">36372326</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Bai</surname><given-names>S</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>B</given-names> </name><name name-style="western"><surname>Ethakota</surname><given-names>J</given-names> </name><etal/></person-group><article-title>The role of artificial intelligence in colorectal cancer and polyp detection: a systematic review</article-title><source>J Clin Oncol</source><year>2025</year><month>02</month><volume>43</volume><issue>4_suppl</issue><fpage>47</fpage><lpage>47</lpage><pub-id pub-id-type="doi">10.1200/JCO.2025.43.4_suppl.47</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="web"><article-title>GPT-4o</article-title><source>OpenAI Developers</source><access-date>2026-02-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://platform.openai.com/docs/models/gpt-4o">https://platform.openai.com/docs/models/gpt-4o</ext-link></comment></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="web"><article-title>Claude platform</article-title><source>Claude API Docs</source><access-date>2026-02-04</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://platform.claude.com/docs/en/release-notes/overview#may-22-2025">https://platform.claude.com/docs/en/release-notes/overview#may-22-2025</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="web"><article-title>Gemini 3</article-title><source>Gemini API</source><access-date>2026-02-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://ai.google.dev/gemini-api/docs/models">https://ai.google.dev/gemini-api/docs/models</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Song</surname><given-names>J</given-names> </name><name
name-style="western"><surname>Xu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>He</surname><given-names>M</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>B</given-names> </name></person-group><article-title>Graph retrieval augmented large language models for facial phenotype associated rare genetic disease</article-title><source>NPJ Digit Med</source><year>2025</year><month>08</month><day>24</day><volume>8</volume><issue>1</issue><fpage>543</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01955-x</pub-id><pub-id pub-id-type="medline">40849403</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><etal/></person-group><article-title>A knowledge-enhanced platform (MetaSepsisKnowHub) for retrieval augmented generation-based sepsis heterogeneity and personalized management: development study</article-title><source>J Med Internet Res</source><year>2025</year><month>06</month><day>6</day><volume>27</volume><fpage>e67201</fpage><pub-id pub-id-type="doi">10.2196/67201</pub-id><pub-id pub-id-type="medline">40478618</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Exploring the potential of artificial 
intelligence to enhance the writing of English academic papers by non-native English-speaking medical students - the educational application of ChatGPT</article-title><source>BMC Med Educ</source><year>2024</year><month>07</month><day>9</day><volume>24</volume><issue>1</issue><fpage>736</fpage><pub-id pub-id-type="doi">10.1186/s12909-024-05738-y</pub-id><pub-id pub-id-type="medline">38982429</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>X</given-names> </name><etal/></person-group><article-title>MSICKB: a curated knowledgebase for exploring molecular heterogeneity and biomarker prioritization in microsatellite instability cancers</article-title><source>Comput Struct Biotechnol J</source><year>2026</year><volume>35</volume><issue>1</issue><fpage>0047</fpage><pub-id pub-id-type="doi">10.34133/csbj.0047</pub-id><pub-id pub-id-type="medline">42017048</pub-id></nlm-citation></ref><ref
id="ref18"><label>18</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title><source>arXiv</source><comment>Preprint posted online on Jan 28, 2022</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2201.11903</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Prompt engineering in consistency and reliability with the evidence-based guideline for LLMs</article-title><source>NPJ Digit Med</source><year>2024</year><month>02</month><day>20</day><volume>7</volume><issue>1</issue><fpage>41</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01029-4</pub-id><pub-id pub-id-type="medline">38378899</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Lewis</surname><given-names>P</given-names> </name><name name-style="western"><surname>Perez</surname><given-names>E</given-names> </name><name name-style="western"><surname>Piktus</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Retrieval-augmented generation for knowledge-intensive NLP tasks</article-title><source>arXiv</source><comment>Preprint posted online on May 22, 2020</comment><pub-id
pub-id-type="doi">10.48550/ARXIV.2005.11401</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><article-title>Colon cancer</article-title><source>National Comprehensive Cancer Network</source><access-date>2026-02-02</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nccn.org/guidelines/guidelines-detail?category=1&#x0026;id=1428">https://www.nccn.org/guidelines/guidelines-detail?category=1&#x0026;id=1428</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><article-title>Colorectal cancer screening</article-title><source>National Comprehensive Cancer Network</source><access-date>2026-02-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nccn.org/guidelines/guidelines-detail?category=1&#x0026;id=1429">https://www.nccn.org/guidelines/guidelines-detail?category=1&#x0026;id=1429</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ajani</surname><given-names>JA</given-names> </name><name name-style="western"><surname>D&#x2019;Amico</surname><given-names>TA</given-names> </name><name name-style="western"><surname>Bentrem</surname><given-names>DJ</given-names> </name><etal/></person-group><article-title>Gastric cancer, version 2.2025, NCCN Clinical Practice Guidelines In Oncology</article-title><source>J Natl Compr Canc Netw</source><year>2025</year><month>05</month><volume>23</volume><issue>5</issue><fpage>169</fpage><lpage>191</lpage><pub-id pub-id-type="doi">10.6004/jnccn.2025.0022</pub-id><pub-id pub-id-type="medline">40341199</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><article-title>Rectal cancer</article-title><source>National Comprehensive Cancer Network</source><access-date>2026-02-11</access-date><comment><ext-link 
ext-link-type="uri" xlink:href="https://www.nccn.org/guidelines/guidelines-detail?category=1&#x0026;id=1461">https://www.nccn.org/guidelines/guidelines-detail?category=1&#x0026;id=1461</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abu-Rustum</surname><given-names>NR</given-names> </name><name name-style="western"><surname>Campos</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Amarnath</surname><given-names>S</given-names> </name><etal/></person-group><article-title>NCCN Guidelines&#x00AE; insights: uterine neoplasms, version 3.2025</article-title><source>J Natl Compr Canc Netw</source><year>2025</year><month>08</month><volume>23</volume><issue>8</issue><fpage>284</fpage><lpage>291</lpage><pub-id pub-id-type="doi">10.6004/jnccn.2025.0038</pub-id><pub-id pub-id-type="medline">40763788</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oaknin</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bosse</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Creutzberg</surname><given-names>CL</given-names> </name><etal/></person-group><article-title>Endometrial cancer: ESMO Clinical Practice Guideline for diagnosis, treatment and follow-up</article-title><source>Ann Oncol</source><year>2022</year><month>09</month><volume>33</volume><issue>9</issue><fpage>860</fpage><lpage>877</lpage><pub-id pub-id-type="doi">10.1016/j.annonc.2022.05.009</pub-id><pub-id pub-id-type="medline">35690222</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lordick</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Carneiro</surname><given-names>F</given-names> </name><name name-style="western"><surname>Cascinu</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Gastric cancer: ESMO Clinical Practice Guideline for diagnosis, treatment and follow-up</article-title><source>Ann Oncol</source><year>2022</year><month>10</month><volume>33</volume><issue>10</issue><fpage>1005</fpage><lpage>1020</lpage><pub-id pub-id-type="doi">10.1016/j.annonc.2022.07.004</pub-id><pub-id pub-id-type="medline">35914639</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Argil&#x00E9;s</surname><given-names>G</given-names> </name><name name-style="western"><surname>Tabernero</surname><given-names>J</given-names> </name><name name-style="western"><surname>Labianca</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Localised colon cancer: ESMO Clinical Practice Guidelines for diagnosis, treatment and follow-up</article-title><source>Ann Oncol</source><year>2020</year><month>10</month><volume>31</volume><issue>10</issue><fpage>1291</fpage><lpage>1305</lpage><pub-id pub-id-type="doi">10.1016/j.annonc.2020.06.022</pub-id><pub-id pub-id-type="medline">32702383</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hofheinz</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Fokas</surname><given-names>E</given-names> </name><name name-style="western"><surname>Benhaim</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Localised rectal cancer: ESMO Clinical Practice Guideline for diagnosis, treatment and follow-up</article-title><source>Ann 
Oncol</source><year>2025</year><month>09</month><volume>36</volume><issue>9</issue><fpage>1007</fpage><lpage>1024</lpage><pub-id pub-id-type="doi">10.1016/j.annonc.2025.05.528</pub-id><pub-id pub-id-type="medline">40412553</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cervantes</surname><given-names>A</given-names> </name><name name-style="western"><surname>Adam</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rosell&#x00F3;</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Metastatic colorectal cancer: ESMO Clinical Practice Guideline for diagnosis, treatment and follow-up</article-title><source>Ann Oncol</source><year>2023</year><month>01</month><volume>34</volume><issue>1</issue><fpage>10</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1016/j.annonc.2022.10.003</pub-id><pub-id pub-id-type="medline">36307056</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>Cancer Genome Atlas Research Network</collab><name name-style="western"><surname>Weinstein</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Collisson</surname><given-names>EA</given-names> </name><etal/></person-group><article-title>The Cancer Genome Atlas pan-cancer analysis project</article-title><source>Nat Genet</source><year>2013</year><month>10</month><volume>45</volume><issue>10</issue><fpage>1113</fpage><lpage>1120</lpage><pub-id pub-id-type="doi">10.1038/ng.2764</pub-id><pub-id pub-id-type="medline">24071849</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>Model cards</article-title><source>Google DeepMind</source><access-date>2026-02-04</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://deepmind.google/models/model-cards/">https://deepmind.google/models/model-cards/</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><article-title>openai / tiktoken</article-title><source>GitHub</source><year>2025</year><access-date>2025-11-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/openai/tiktoken">https://github.com/openai/tiktoken</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landis</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Koch</surname><given-names>GG</given-names> </name></person-group><article-title>The measurement of observer agreement for categorical data</article-title><source>Biometrics</source><year>1977</year><month>03</month><volume>33</volume><issue>1</issue><fpage>159</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.2307/2529310</pub-id><pub-id pub-id-type="medline">843571</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zong</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Core reference ontology for individualized exercise prescription</article-title><source>Sci Data</source><year>2024</year><month>12</month><day>18</day><volume>11</volume><issue>1</issue><fpage>1349</fpage><pub-id pub-id-type="doi">10.1038/s41597-024-04217-9</pub-id><pub-id pub-id-type="medline">39695140</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Yu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>PCAO2: an ontology for integration of prostate cancer associated genotypic, phenotypic and lifestyle data</article-title><source>Brief Bioinform</source><year>2024</year><month>03</month><day>27</day><volume>25</volume><issue>3</issue><fpage>bbae136</fpage><pub-id pub-id-type="doi">10.1093/bib/bbae136</pub-id><pub-id pub-id-type="medline">38557678</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bi</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>NDDRF 2.0: an update and expansion of risk factor knowledge base for personalized prevention of neurodegenerative diseases</article-title><source>Alzheimers Dement</source><year>2025</year><month>05</month><volume>21</volume><issue>5</issue><fpage>e70282</fpage><pub-id pub-id-type="doi">10.1002/alz.70282</pub-id><pub-id pub-id-type="medline">40371632</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zheng</surname><given-names>X</given-names> </name><name name-style="western"><surname>Bi</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bo</surname><given-names>W</given-names> </name><etal/></person-group><article-title>DRPMKB1.0: a comprehensive knowledge base for an AI-oriented drug repositioning prediction model</article-title><source>J Chem Inf 
Model</source><year>2026</year><month>01</month><day>12</day><volume>66</volume><issue>1</issue><fpage>122</fpage><lpage>137</lpage><pub-id pub-id-type="doi">10.1021/acs.jcim.5c01945</pub-id><pub-id pub-id-type="medline">41474981</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>F</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>CoT-RAG: integrating chain of thought and retrieval-augmented generation to enhance reasoning in large language models</article-title><source>arXiv</source><comment>Preprint posted online on Apr 18, 2025</comment><pub-id pub-id-type="doi">10.18653/v1/2025.findings-emnlp.168</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Li</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Evaluating the external and parametric knowledge fusion of large language models</article-title><source>arXiv</source><comment>Preprint posted online on May 29, 2024</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2405.19010</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kandpal</surname><given-names>N</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>H</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>A</given-names> 
</name><name name-style="western"><surname>Wallace</surname><given-names>E</given-names> </name><name name-style="western"><surname>Raffel</surname><given-names>C</given-names> </name></person-group><article-title>Large language models struggle to learn long-tail knowledge</article-title><source>arXiv</source><comment>Preprint posted online on Nov 15, 2022</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2211.08411</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Shen</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Are we on the right way for assessing document retrieval-augmented generation?</article-title><source>arXiv</source><comment>Preprint posted online on Aug 5, 2025</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2508.03644</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Masanneck</surname><given-names>L</given-names> </name><name name-style="western"><surname>Meuth</surname><given-names>SG</given-names> </name><name name-style="western"><surname>Pawlitzki</surname><given-names>M</given-names> </name></person-group><article-title>Evaluating base and retrieval augmented LLMs with document or online support for evidence based neurology</article-title><source>NPJ Digit Med</source><year>2025</year><month>03</month><day>4</day><volume>8</volume><issue>1</issue><fpage>137</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01536-y</pub-id><pub-id pub-id-type="medline">40038423</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation 
citation-type="web"><article-title>ExplorerZhyuxin/MSIC-Benchmark</article-title><source>GitHub</source><access-date>2026-02-26</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/ExplorerZhyuxin/MSIC-Benchmark/tree/main">https://github.com/ExplorerZhyuxin/MSIC-Benchmark/tree/main</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Key methodological details&#x2014;including full system prompts, model specifications, and error analysis taxonomies&#x2014;to ensure transparency and reproducibility of the main study.</p><media xlink:href="jmir_v28i1e88614_app1.docx" xlink:title="DOCX File, 5508 KB"/></supplementary-material></app-group></back></article>