<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="review-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e89862</article-id><article-id pub-id-type="doi">10.2196/89862</article-id><article-categories><subj-group subj-group-type="heading"><subject>Review</subject></subj-group></article-categories><title-group><article-title>Large Language Models in Colorectal Cancer Care and Clinical Decision Support: Systematic Review</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Tian</surname><given-names>Jinglei</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Lou</surname><given-names>Qifeng</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Xue</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Xu</surname><given-names>Hangying</given-names></name><degrees>BS</degrees><xref ref-type="aff" 
rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mei</surname><given-names>Huiting</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yu</surname><given-names>Yanli</given-names></name><degrees>BS</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Zhejiang Chinese Medical University</institution><addr-line>Zhejiang Chinese Medical University, Hangzhou, China</addr-line><addr-line>Hangzhou</addr-line><addr-line>Zhejiang</addr-line><country>China</country></aff><aff id="aff2"><institution>Department of Gastroenterology, Hangzhou First People's Hospital</institution><addr-line>Hangzhou, China</addr-line><addr-line>Hangzhou</addr-line><addr-line>Zhejiang</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Brini</surname><given-names>Stefano</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Sagkriotis</surname><given-names>Alexandros</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Tosun</surname><given-names>Ilker</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Qifeng Lou, MSc, Department of Gastroenterology, Hangzhou First People's Hospital, Hangzhou, China, Hangzhou, Zhejiang, China, 86 15267498545; <email>louqifeng0503@163.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>21</day><month>5</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e89862</elocation-id><history><date date-type="received"><day>22</day><month>12</month><year>2025</year></date><date date-type="rev-recd"><day>26</day><month>04</month><year>2026</year></date><date 
date-type="accepted"><day>27</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Jinglei Tian, Qifeng Lou, Xue Wang, Hangying Xu, Huiting Mei, Yanli Yu. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 21.5.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e89862"/><abstract><sec><title>Background</title><p>Colorectal cancer (CRC) is a leading cause of cancer morbidity and mortality worldwide. The complexity of guideline-concordant care and unstructured clinical data has driven demand for decision-support tools. 
Large language models (LLMs) show promise for processing clinical data and patient&#x2013;provider communication, yet evidence is fragmented, and a CRC-specific synthesis across the full care continuum is lacking.</p></sec><sec><title>Objective</title><p>This systematic review evaluates the current applications, performance determinants, and clinical implications of LLMs across the continuum of CRC care.</p></sec><sec sec-type="methods"><title>Methods</title><p>Following PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses), we searched 6 databases (PubMed, Embase, Web of Science, Scopus, CINAHL, Cochrane) through April 1, 2026. Eligible studies were peer-reviewed original investigations of LLMs on CRC tasks with extractable outcomes; reviews, editorials, and abstracts were excluded. Two reviewers assessed quality with QUADAS-2 (Quality Assessment of Diagnostic Accuracy Studies-2), PROBAST (prediction model risk of bias assessment tool), and ROBINS-I (Risk of Bias in Nonrandomized Studies - of Interventions). Data on model types, applications, prompts, input/output formats, and outcomes were analyzed descriptively, with narrative synthesis per synthesis without meta-analysis (SWiM) guidelines.</p></sec><sec sec-type="results"><title>Results</title><p>Of 8880 records, 37 studies met inclusion criteria (2023&#x2010;2026), mostly from China and the United States, with GPT series most frequently evaluated. Overall risk of bias was low in 10/37 studies (27.0%), moderate in 14/37 (37.8%), unclear in 7/37 (18.9%), and high or serious in 6/37 (16.2%). Problematic domains included outcome measurement, intervention classification, patient selection, and lack of blinded assessment. LLMs showed utility in automating data extraction from clinical texts, supporting patient education, aiding diagnosis, and assisting clinical decision-making, with emerging visual interpretation and multimodal capacities. 
Domain-specific and multimodal models showed advantages over general-purpose models in certain tasks. Performance was significantly influenced by prompt design, from zero-shot queries to fine-tuning. Despite efficiency and outcome benefits, challenges persist regarding methodological quality, data privacy, and generalizability.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This review provides an integrative framework synthesizing evidence across study designs and LLM categories in CRC care. Unlike prior reviews addressing gastroenterology broadly or limited to one design, it covers the full CRC continuum and, for the first time, comparatively evaluates general-purpose, domain-specific, and multimodal LLMs, clarifying how prompt engineering and heterogeneous metrics shape outcomes. Although findings support LLMs&#x2019; clinical potential, results must be interpreted cautiously, given low overall evidence quality. Most studies lacked safeguards against bias&#x2014;blinded assessment, confounder adjustment, or prospective multicenter validation. Substantial heterogeneity across tasks, LLM types, prompts, reference standards, and outcomes means reported advantages cannot be generalized. Future work should prioritize real-world integration via prospective multicenter validation, robust privacy frameworks, and rigorous human oversight. 
Amid rising global CRC burden and health care disparities, this review informs clinical translation, equitable scaling, and policy on LLM deployment.</p></sec><sec><title>Trial Registration</title><p>PROSPERO CRD420251248261; https://www.crd.york.ac.uk/PROSPERO/view/CRD420251248261</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>large language models</kwd><kwd>colorectal cancer</kwd><kwd>gastroenterology</kwd><kwd>systematic review</kwd><kwd>PRISMA</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Colorectal cancer (CRC) is the third most commonly diagnosed malignancy and the second leading cause of cancer-related mortality worldwide, with incidence projected to rise substantially through 2050 [<xref ref-type="bibr" rid="ref1">1</xref>]. Contemporary CRC care spans a long continuum: risk stratification, screening, endoscopic and histopathological diagnosis, multidisciplinary treatment, and long-term surveillance, in which each stage generates dense, largely unstructured clinical text and requires time-sensitive, guideline-concordant decisions [<xref ref-type="bibr" rid="ref2">2</xref>]. This labor-intensive process is time-consuming and error-prone due to visual fatigue and information gaps inherent in voluminous clinical notes, pulling clinicians from direct patient care and straining both providers and institutional resources [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. Within this context, large language models (LLMs) built on the Transformer architecture have emerged as a candidate interface between complex clinical text and decision support [<xref ref-type="bibr" rid="ref5">5</xref>]. 
Compared with conventional clinical decision-support and patient education modalities, LLMs offer several distinct advantages: automated extraction and processing of large-scale clinical follow-up records [<xref ref-type="bibr" rid="ref6">6</xref>], real-time responses to patient inquiries regarding CRC symptoms and prevention [<xref ref-type="bibr" rid="ref7">7</xref>], guidance for geographically tailored screening strategies [<xref ref-type="bibr" rid="ref8">8</xref>], and enhanced adherence to clinical quality improvement initiatives [<xref ref-type="bibr" rid="ref5">5</xref>], all while being less constrained by outpatient scheduling or geographic disparities in health care resource distribution [<xref ref-type="bibr" rid="ref9">9</xref>]. This approach conserves clinician time and reduces operational costs while simultaneously improving the accessibility, flexibility, and scalability of CRC-related health information for patients [<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>Against this backdrop, research on LLMs in CRC has expanded rapidly between 2024 and 2026, spanning the entire care continuum. In screening and early detection, GPT-4 and its successors have been evaluated for risk-stratified counseling and family-history triage for hereditary CRC syndromes [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>], while multiple studies have also explored the clinical utility of LLM-based tools, notably ChatGPT (OpenAI), for preoperative screening consultations and postoperative surveillance monitoring in patients with CRC [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. In endoscopy, LLMs have been applied to automate colonoscopy report generation [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. 
In pathology, LLMs have been used to extract tumor&#x2013;node&#x2013;metastasis (TNM) descriptors and microsatellite instability status [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. Therapeutic decision support has emerged as a particularly active area, with LLM recommendations benchmarked against multidisciplinary tumor board consensus [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. The accelerating volume of these publications makes a focused, structured synthesis both timely and necessary.</p><p>Nevertheless, digital health models are not without limitations, including technically inaccurate outputs attributable to hallucinations [<xref ref-type="bibr" rid="ref21">21</xref>], quality assurance concerns in complex diagnostic and therapeutic recommendations [<xref ref-type="bibr" rid="ref22">22</xref>], and challenges related to model bias, limited generalizability, and the absence of physician empathy [<xref ref-type="bibr" rid="ref23">23</xref>]. The emerging literature also reflects substantial heterogeneity, with findings that vary across studies. Model selection is one key factor [<xref ref-type="bibr" rid="ref24">24</xref>]. Published studies have compared various general-purpose and medically fine-tuned models, with consistent reports distinguishing the performance of GPT-4-class and domain-tuned models from that of earlier or smaller backbones in oncology evaluations, while open-source models offer data-privacy advantages but display variable accuracy across CRC tasks [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. 
Equally consequential is the choice of prompt engineering strategy: zero-shot prompting, few-shot prompting, chain-of-thought reasoning, retrieval-augmented generation (RAG), and guideline-grounded prompting yield markedly different accuracy on identical CRC questions [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref28">28</xref>], with several studies reporting accuracy gains when few-shot or RAG approaches replace naive zero-shot baselines [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. Additional sources of heterogeneity include differences in evaluation rubrics, question framing, and prompt language [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. Consequently, 2 studies addressing apparently similar questions can reach opposing conclusions. Amini et al [<xref ref-type="bibr" rid="ref13">13</xref>] assessed the clinical utility of freely available LLMs for colonoscopy surveillance interval recommendations across diverse settings, finding insufficient accuracy and notable limitations. In contrast, Chang et al [<xref ref-type="bibr" rid="ref14">14</xref>], using the more capable GPT-4 model and a guideline-anchored expert panel as reference, concluded that ChatGPT-4 exhibited accuracy comparable to professional gastroenterologists.</p><p>Within the gastroenterological domain, several reviews have mapped LLM applications. Gong conducted a systematic review of LLMs in gastroenterology and gastrointestinal endoscopy, categorizing applications into knowledge-based response evaluation and document automation, with most studies focusing on GPT-series models [<xref ref-type="bibr" rid="ref9">9</xref>]. 
Omar et al [<xref ref-type="bibr" rid="ref15">15</xref>] reviewed 57 natural language processing (NLP) and LLM studies in gastroenterology and hepatology, confirming improved data extraction from electronic health records (EHRs) but noting persistent challenges in integrating these tools into routine clinical practice. Furthermore, a recent systematic review in lung cancer identified critical methodological limitations in primary LLM studies, notably a reliance on retrospective data and unclear risk of bias [<xref ref-type="bibr" rid="ref33">33</xref>]. Given the fundamental differences in oncology protocols, the specific, multi-stage clinical trajectory of CRC, spanning distinct endoscopic, pathological, and surgical phases, necessitates an isolated, disease-specific appraisal to objectively evaluate LLM viability. However, a conspicuous gap remains: no systematic review has comprehensively evaluated the evidence for LLM applications specifically within the CRC domain. In particular, the information quality of LLM outputs across the full CRC care continuum has been insufficiently addressed in prior systematic reviews. Compounding this limitation, although recent studies have demonstrated that LLMs can achieve clinician-level performance in specific clinical tasks, substantial heterogeneity in model selection, prompt engineering strategies, and evaluation metrics precludes generalizable conclusions [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>].</p><p>Accordingly, this systematic review aims to evaluate the performance of different LLM categories across the full CRC care continuum, address evidence gaps arising from fragmented research practices, and provide a foundation for future research and clinical translation, covering use cases, model types, optimization strategies, limitations, and future directions. 
Specifically, this review seeks to (1) map LLM applications across the principal clinical domains of CRC management; (2) compare general-purpose, domain-specific, and multimodal LLMs under different prompt engineering and fine-tuning strategies; and (3) classify included studies according to their research design and apply corresponding quality appraisal tools to assess the credibility of individual studies.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Eligibility Criteria</title><p>The eligibility criteria for this review were established according to the PICOS (Population, Intervention, Comparison, Outcome, Study design) framework, as detailed in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>PICOS (Population, Intervention, Comparison, Outcome, Study design) eligibility criteria.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Criteria</td><td align="left" valign="bottom">Definition</td></tr></thead><tbody><tr><td align="left" valign="top">Participants</td><td align="left" valign="top">General population or patients with CRC.</td></tr><tr><td align="left" valign="top">Intervention</td><td align="left" valign="top">Artificial intelligence, specifically LLMs<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> applied in CRC<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> management. These may be applications used by patients or health care providers for auxiliary diagnosis, information extraction, knowledge-based question answering, treatment decision-making, predictive modeling, or scientific research. 
LLMs are advanced AI<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> systems designed to process complex clinical data, support decision-making, and enable effective communication.</td></tr><tr><td align="left" valign="top">Control</td><td align="left" valign="top">Control (applicable exclusively to comparative study designs): Standard clinical evaluation by health care professionals or conventional non-LLM computational algorithms. Studies without a control group were eligible for inclusion if the other criteria were met.</td></tr><tr><td align="left" valign="top">Outcomes</td><td align="left" valign="top">Outcome measures included clinical and performance effectiveness (eg, accuracy, <italic>F</italic><sub>1</sub>-score, area under the curve, sensitivity, concordance rate) and qualitative/utility measures (eg, response completeness, clarity, comprehensiveness, guideline adherence).</td></tr><tr><td align="left" valign="top">Study types</td><td align="left" valign="top">All study types were considered (eg, exploratory or comparative designs) so long as the original research concept was implemented and tested regarding LLMs and CRC. Nonoriginal research, such as books, book chapters, letters, reviews, and conference proceedings, was excluded.</td></tr><tr><td align="left" valign="top">Other</td><td align="left" valign="top">Only English-language articles were included.</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table1fn2"><p><sup>b</sup>CRC: colorectal cancer.</p></fn><fn id="table1fn3"><p><sup>c</sup>AI: artificial intelligence.</p></fn></table-wrap-foot></table-wrap><p>Discrepancies were resolved by discussion, with arbitration by a third reviewer. 
This review was conducted following PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) 2020 [<xref ref-type="bibr" rid="ref36">36</xref>], with search reporting per PRISMA-S [<xref ref-type="bibr" rid="ref37">37</xref>] and narrative synthesis per SWiM guidelines [<xref ref-type="bibr" rid="ref38">38</xref>].</p></sec><sec id="s2-2"><title>Information Sources</title><p>Relevant studies were identified by systematically searching 6 electronic databases: PubMed, Web of Science, Embase, Cochrane Library, CINAHL, and Scopus (search cutoff date: April 1, 2026). Each database was searched individually; no multi-database searching on a single platform was performed. No published search filters (eg, validated study design filters) were applied to any database search.</p></sec><sec id="s2-3"><title>Search Strategy</title><p>The search strategy combined controlled vocabulary terms (Medical Subject Headings [MeSH] and Emtree) and free-text keywords related to CRC and LLMs. These terms were adapted for each database to maximize retrieval sensitivity. Key terms included: &#x201C;colonic neoplasms,&#x201D; &#x201C;colorectal cancer*,&#x201D; &#x201C;large language models,&#x201D; &#x201C;artificial intelligence,&#x201D; &#x201C;LLM,&#x201D; &#x201C;GPT,&#x201D; &#x201C;ChatGPT,&#x201D; &#x201C;Claude,&#x201D; &#x201C;Gemini,&#x201D; and &#x201C;LLaMA.&#x201D; The search process followed the PRISMA Search Strategy Extension (PRISMA-S) [<xref ref-type="bibr" rid="ref37">37</xref>]. The complete search strategy, including specific search queries, applied limits, and the number of records retrieved from each database, is provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
The initial search was established and updated through April 1, 2026, to capture the most recent publications prior to data synthesis.</p><p>Regarding the PRISMA-S checklist, certain items were not applicable to our methodology: study registries and regulatory databases were not searched, as research on LLMs in CRC is generally not registered as clinical trials; gray literature, institutional websites, conference proceedings, and preprint servers were not searched; aside from manually screening reference lists, no citation searching tools were used; no additional search methods such as PubMed Related Articles, personal reference libraries, or other database-embedded related-article recommendation features were employed; and stakeholders or content experts were not contacted to identify additional studies, as the designed search was considered sufficiently comprehensive through database coverage alone. Although corresponding authors were contacted via email regarding missing or ambiguous data during the data extraction process, no authors, experts, manufacturers, or other parties were specifically contacted to identify additional studies or unpublished data for inclusion in this review. The search strategy did not undergo formal external peer review (such as the PRESS checklist process) but was cross-checked and finalized by investigators within the research team. A complete PRISMA-S checklist is provided in <xref ref-type="supplementary-material" rid="app5">Checklist 1</xref>.</p></sec><sec id="s2-4"><title>Selection Process</title><p>EndNote X9.3.3 (Clarivate Analytics, US) was used for reference management and automated deduplication, followed by manual verification. Two reviewers (JL and HT) independently screened titles and abstracts, then full texts against eligibility criteria. Discrepancies were resolved by discussion, with arbitration by a third reviewer (QF). 
Interrater agreement was assessed using the Kappa statistic.</p></sec><sec id="s2-5"><title>Data Collection Process</title><p>Two reviewers (JL and WX) independently extracted data using a predesigned form (WPS Office Excel). Extracted items included: title, first author, year, study design, LLM model, model modality, application scenario, prompt engineering approach, input/output formats, and outcome measures. Interreviewer consistency was calculated; disagreements were resolved by a third reviewer (QF). For missing or ambiguous data, corresponding authors were contacted via email; if unavailable after 2 weeks, items were recorded as &#x201C;not reported&#x201D; and excluded from descriptive analyses. No imputation was applied. For studies reporting multiple outcomes, we gave preference to the primary outcome defined by the authors; if none was specified, we selected the metric most central to the study&#x2019;s objective through consensus between 2 reviewers. For other types of outcomes, we extracted the reported values without modification.</p></sec><sec id="s2-6"><title>Data Items</title><p>To manage the inherent overlap between technical tasks, studies were categorized based on their primary terminal clinical objective. For instance, studies employing information extraction specifically to enable automated TNM staging were classified under &#x201C;Auxiliary Diagnosis&#x201D; rather than &#x201C;Information Extraction&#x201D; to prioritize clinical utility over technical subprocesses.</p></sec><sec id="s2-7"><title>Study Risk of Bias Assessment</title><p>Following Omar and Levkovich [<xref ref-type="bibr" rid="ref39">39</xref>], the included studies were classified and evaluated based on the assessment design and outcome indicators of the studies rather than their clinical application fields. 
QUADAS-2 (Quality Assessment of Diagnostic Accuracy Studies-2) [<xref ref-type="bibr" rid="ref40">40</xref>] was applied for diagnostic accuracy studies validating LLM performance against histopathological diagnosis, endoscopist consensus, or clinical guidelines. PROBAST (prediction model risk of bias assessment tool) [<xref ref-type="bibr" rid="ref41">41</xref>] was applied for prediction model studies focusing on the development and validation of LLM-based predictive models. ROBINS-I (Risk of Bias in Nonrandomized Studies - of Interventions) [<xref ref-type="bibr" rid="ref42">42</xref>] was applied for nonrandomized intervention studies evaluating the LLM application effect or clinical value, including information extraction and knowledge-based tasks. Study classifications and corresponding tools are detailed in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><p>Given that LLM studies differ from conventional clinical trials, 2 oncology experts (QF) made minor framework-preserving adaptations to each tool; specific adaptations are documented in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. Assessment was conducted independently by 2 researchers (JL and HT), with a third (WX) resolving disagreements. Final results were reviewed by an expert (QF). Interrater agreement was evaluated using the Kappa statistic. Overall evidence strength was evaluated considering study quality, consistency of findings, and methodological limitations.</p></sec><sec id="s2-8"><title>Synthesis Methods</title><p>Given the anticipated heterogeneity in clinical tasks, study designs, and outcome constructs, narrative synthesis following SWiM reporting guidelines [<xref ref-type="bibr" rid="ref38">38</xref>] was planned a priori rather than quantitative meta-analysis. 
Meta-analysis was not conducted for 4 reasons: (1) substantial heterogeneity across fundamentally different clinical tasks, rendering pooled estimates uninterpretable; (2) a high proportion of studies rated at moderate, serious, or high risk of bias; (3) fewer than 5 studies within any subgroup sharing comparable task definitions, input modalities, and reference standards; and (4) marked inconsistency in outcome measures precluding standardized effect size extraction.</p></sec><sec id="s2-9"><title>Reporting Bias Assessment</title><p>This systematic review employed a narrative synthesis and did not perform statistical tests for publication bias. Given the absence of a quantitative meta-analysis and the substantial heterogeneity in study design and outcome reporting across included studies, methods such as funnel plots were considered inapplicable. During evidence synthesis and result interpretation, the research team conducted a qualitative assessment of potential reporting bias. By comparing the consistency between study objectives, methods, and reported outcomes, and by incorporating study registration information (where available) and author explanations, the team cautiously discussed the potential impact of missing results on study conclusions.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Study Selection</title><p>A total of 8880 records were retrieved (PubMed: 4047; Embase: 1423; Web of Science: 3061; Cochrane Library: 43; Scopus: 43; CINAHL: 263). After automated and manual deduplication using EndNote X9.3.3, 6260 unique records were identified. Following title/abstract screening, 2533 full-text articles were assessed, and 37 studies met the inclusion criteria. The screening-stage Kappa was 0.85. 
The screening process is presented in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) 2020 flow diagram of the study selection process for examining the role of LLMs in colorectal cancer. CRC: colorectal cancer; LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e89862_fig01.png"/></fig></sec><sec id="s3-2"><title>Study Characteristics</title><p>The data extraction consistency rate was 0.97. All 37 studies were published between 2023 and 2026, 2 in 2023 [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref43">43</xref>], 11 in 2024 [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref44">44</xref>-<xref ref-type="bibr" rid="ref50">50</xref>], 22 in 2025 [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref51">51</xref>-<xref ref-type="bibr" rid="ref67">67</xref>], and 2 in 2026 [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref68">68</xref>]. 
Studies primarily originated from China [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref54">54</xref>-<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref59">59</xref>,<xref ref-type="bibr" rid="ref64">64</xref>,<xref ref-type="bibr" rid="ref65">65</xref>] and the United States [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref60">60</xref>,<xref ref-type="bibr" rid="ref66">66</xref>,<xref ref-type="bibr" rid="ref68">68</xref>], with others from Italy [<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref63">63</xref>], Germany [<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref61">61</xref>], Singapore [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref51">51</xref>], Israel [<xref ref-type="bibr" rid="ref43">43</xref>], Switzerland [<xref ref-type="bibr" rid="ref67">67</xref>], Spain [<xref ref-type="bibr" rid="ref52">52</xref>], Turkey [<xref ref-type="bibr" rid="ref48">48</xref>], the United Kingdom [<xref ref-type="bibr" rid="ref44">44</xref>], South Korea [<xref ref-type="bibr" rid="ref69">69</xref>], and multinational collaborations [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. 
Application domains included auxiliary diagnosis [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref65">65</xref>], information extraction [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref69">69</xref>], knowledge-based question answering [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref63">63</xref>,<xref ref-type="bibr" rid="ref64">64</xref>], treatment decision-making [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref67">67</xref>,<xref ref-type="bibr" rid="ref68">68</xref>], predictive modeling [<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref54">54</xref>], scientific research [<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref66">66</xref>], and aided nursing [<xref ref-type="bibr" rid="ref60">60</xref>].</p><p>The LLMs used varied widely, with the most frequent being OpenAI&#x2019;s GPT series. 
Other models included Google&#x2019;s Gemini, Anthropic&#x2019;s Claude, Meta&#x2019;s LLaMA series, as well as DeepSeek, GLM, and Qwen, among others. The best-performing models identified in comparative studies are summarized in <xref ref-type="table" rid="table2">Table 2</xref>. The results suggest that models such as GPT-4, GPT-4o, and Claude 2.1 showed relatively favorable performance in some tasks [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref68">68</xref>,<xref ref-type="bibr" rid="ref69">69</xref>]; o3-mini reportedly showed comparatively higher intra-model stability and expert concordance among reasoning-oriented models for multidisciplinary team decision simulation [<xref ref-type="bibr" rid="ref20">20</xref>]. However, for specific tasks, lightweight models or domain-specialized models may also perform optimally [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref52">52</xref>]. A summary of these details is provided in <xref ref-type="table" rid="table2">Table 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Summary of included sources. Pure LLM: text-only language model processing textual inputs exclusively. 
Multimodal VLM: vision-language model capable of processing both textual and visual inputs (eg, GPT-4V, GPT-4o with image input).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Study</td><td align="left" valign="bottom">Country</td><td align="left" valign="bottom">LLMs<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> used</td><td align="left" valign="bottom">Model type</td><td align="left" valign="bottom">Application domain</td><td align="left" valign="bottom">Best performer</td></tr></thead><tbody><tr><td align="left" valign="top">Zeng, 2025 [<xref ref-type="bibr" rid="ref8">8</xref>]</td><td align="left" valign="top">Multi-national</td><td align="left" valign="top">ChatGPT-4.5</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Treatment Decision</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">Zeng, 2025 [<xref ref-type="bibr" rid="ref56">56</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">ChatGPT-4o, DeepSeek</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Treatment Decision</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Schmutz, 2025 [<xref ref-type="bibr" rid="ref61">61</xref>]</td><td align="left" valign="top">Germany</td><td align="left" valign="top">ChatGPT 4.0</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Treatment Decision</td><td align="left" valign="top">ChatGPT 4.0</td></tr><tr><td align="left" valign="top">Chatziisaak, 2025 [<xref ref-type="bibr" rid="ref67">67</xref>]</td><td align="left" valign="top">Switzerland</td><td align="left" valign="top">ChatGPT-4</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Treatment Decision</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" 
valign="top">Horesh, 2025 [<xref ref-type="bibr" rid="ref53">53</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">ChatGPT-3.5</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Treatment Decision</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Kaiser, 2024 [<xref ref-type="bibr" rid="ref47">47</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">ChatGPT-3.5, Microsoft Copilot</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Treatment Decision</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Garg, 2026 [<xref ref-type="bibr" rid="ref68">68</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">ChatGPT-4o</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Treatment Decision</td><td align="left" valign="top">GPT-4o</td></tr><tr><td align="left" valign="top">Qu, 2026 [<xref ref-type="bibr" rid="ref20">20</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">ChatGPT-o3-mini, DeepSeek-R1, Qwen qwq-plus</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Treatment Decision</td><td align="left" valign="top">o3-mini</td></tr><tr><td align="left" valign="top">Diaz, 2025 [<xref ref-type="bibr" rid="ref66">66</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">AI-HOPE (LLaMA 3-based)</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Scientific Research</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Yang, 2025 [<xref ref-type="bibr" rid="ref58">58</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">LLaMA 3</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Scientific Research</td><td align="left" 
valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Yang, 2025 [<xref ref-type="bibr" rid="ref54">54</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">BGE-M3, XGBoost</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Predictive Modeling</td><td align="left" valign="top">XGBoost</td></tr><tr><td align="left" valign="top">Kim, 2025 [<xref ref-type="bibr" rid="ref51">51</xref>]</td><td align="left" valign="top">Singapore</td><td align="left" valign="top">BioBERT-Large, RadImageNet, 3D ResNet</td><td align="left" valign="top">Multimodal VLM</td><td align="left" valign="top">Predictive Modeling</td><td align="left" valign="top">BioBERT-Large</td></tr><tr><td align="left" valign="top">Lim, 2024 [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">Singapore</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Knowledge QA</td><td align="left" valign="top">GPT-4</td></tr><tr><td align="left" valign="top">Hu, 2025 [<xref ref-type="bibr" rid="ref64">64</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">ChatGPT-4.5</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Knowledge QA</td><td align="left" valign="top">ChatGPT-4.5</td></tr><tr><td align="left" valign="top">Peng, 2024 [<xref ref-type="bibr" rid="ref7">7</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">ChatGPT-3.5</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Knowledge QA</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Wang, 2024 [<xref ref-type="bibr" rid="ref50">50</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">GPT-3.5-turbo</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Knowledge QA</td><td align="left" 
valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Zhang, 2025 [<xref ref-type="bibr" rid="ref55">55</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">ChatGPT-4o, Claude 3.5, DeepSeek</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Knowledge QA</td><td align="left" valign="top">ChatGPT-4o</td></tr><tr><td align="left" valign="top">Zhou, 2024 [<xref ref-type="bibr" rid="ref25">25</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">ChatGPT, Doctor GPT, Llama-2-70B, Mixtral-8 &#x00D7; 7B, Bard, Claude 2.1</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Knowledge QA</td><td align="left" valign="top">Claude 2.1</td></tr><tr><td align="left" valign="top">Gorelik, 2023 [<xref ref-type="bibr" rid="ref43">43</xref>]</td><td align="left" valign="top">Israel</td><td align="left" valign="top">ChatGPT-4</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Knowledge QA</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Maida, 2025 [<xref ref-type="bibr" rid="ref63">63</xref>]</td><td align="left" valign="top">Italy</td><td align="left" valign="top">ChatGPT-4o</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Knowledge QA</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Maida, 2025 [<xref ref-type="bibr" rid="ref10">10</xref>]</td><td align="left" valign="top">Multi-national</td><td align="left" valign="top">ChatGPT-4</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Knowledge QA</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Emile, 2023 [<xref ref-type="bibr" rid="ref22">22</xref>]</td><td align="left" valign="top">Multi-national</td><td align="left" valign="top">ChatGPT-3.5</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Knowledge 
QA</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Kepez, 2024 [<xref ref-type="bibr" rid="ref48">48</xref>]</td><td align="left" valign="top">Turkey</td><td align="left" valign="top">ChatGPT-4</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Knowledge QA</td><td align="left" valign="top">ChatGPT-4</td></tr><tr><td align="left" valign="top">Atarere, 2024 [<xref ref-type="bibr" rid="ref45">45</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">ChatGPT, BingChat, YouChat</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Knowledge QA</td><td align="left" valign="top">ChatGPT, YouChat</td></tr><tr><td align="left" valign="top">Yu, 2025 [<xref ref-type="bibr" rid="ref57">57</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">Gemini, GPT-4, GPT-4o, Claude, Llama, DeepSeek, GLM, Qwen</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Information Extraction</td><td align="left" valign="top">GPT-4</td></tr><tr><td align="left" valign="top">Chizhikova, 2025 [<xref ref-type="bibr" rid="ref52">52</xref>]</td><td align="left" valign="top">Spain</td><td align="left" valign="top">RoBERTa</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Information Extraction</td><td align="left" valign="top">Task-specific models</td></tr><tr><td align="left" valign="top">Alzaid, 2024 [<xref ref-type="bibr" rid="ref44">44</xref>]</td><td align="left" valign="top">United Kingdom</td><td align="left" valign="top">ChatGPT-4 Turbo, GPT-4V</td><td align="left" valign="top">Multimodal VLM</td><td align="left" valign="top">Information Extraction</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Johnson, 2025 [<xref ref-type="bibr" rid="ref17">17</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">Gemma-2-9B-It-SPPO, 
Llama-3-8B-Instruct</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Information Extraction</td><td align="left" valign="top">Gemma-2</td></tr><tr><td align="left" valign="top">Kim, 2025 [<xref ref-type="bibr" rid="ref69">69</xref>]</td><td align="left" valign="top">South Korea</td><td align="left" valign="top">GPT-4</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Information Extraction</td><td align="left" valign="top">GPT-4</td></tr><tr><td align="left" valign="top">Ding, 2025 [<xref ref-type="bibr" rid="ref65">65</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">ChatGPT-4</td><td align="left" valign="top">Multimodal VLM</td><td align="left" valign="top">Auxiliary Diagnosis</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Liu, 2024 [<xref ref-type="bibr" rid="ref49">49</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">ChatGPT-3.5, ChatGPT-4.0</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Auxiliary Diagnosis</td><td align="left" valign="top">GPT-4.0</td></tr><tr><td align="left" valign="top">Wang, 2025 [<xref ref-type="bibr" rid="ref59">59</xref>]</td><td align="left" valign="top">China</td><td align="left" valign="top">ChatGPT, Claude, ERNIE, SAM</td><td align="left" valign="top">Multimodal VLM</td><td align="left" valign="top">Auxiliary Diagnosis</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">Ferber, 2024 [<xref ref-type="bibr" rid="ref46">46</xref>]</td><td align="left" valign="top">Germany</td><td align="left" valign="top">ChatGPT-4V</td><td align="left" valign="top">Multimodal VLM</td><td align="left" valign="top">Auxiliary Diagnosis</td><td align="left" valign="top">GPT-4V</td></tr><tr><td align="left" valign="top">Massimi, 2025 [<xref ref-type="bibr" rid="ref62">62</xref>]</td><td align="left" valign="top">Italy</td><td align="left" 
valign="top">ChatGPT-4o</td><td align="left" valign="top">Multimodal VLM</td><td align="left" valign="top">Auxiliary Diagnosis</td><td align="left" valign="top">GPT-4o</td></tr><tr><td align="left" valign="top">Amini, 2025 [<xref ref-type="bibr" rid="ref13">13</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">GPT-3.5-turbo, Bard (PaLM 2)</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Auxiliary Diagnosis</td><td align="left" valign="top">ChatGPT-3.5</td></tr><tr><td align="left" valign="top">Chang, 2024 [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">ChatGPT-4</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Auxiliary Diagnosis</td><td align="left" valign="top">ChatGPT-4</td></tr><tr><td align="left" valign="top">Sehgal, 2025 [<xref ref-type="bibr" rid="ref60">60</xref>]</td><td align="left" valign="top">United States</td><td align="left" valign="top">ChatGPT-4.1</td><td align="left" valign="top">Pure LLM</td><td align="left" valign="top">Aided Nursing</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table2fn2"><p><sup>b</sup>No intermodel comparison was performed or the metric is not applicable.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Prompt Engineering and Model Training</title><p>The data extraction consistency rate was 0.97. We synthesized prompt engineering strategies, model inputs/outputs, and evaluation metrics (<xref ref-type="table" rid="table3">Table 3</xref>). 
Five studies [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref63">63</xref>] did not explicitly describe prompting strategies, employing basic queries primarily for educational purposes. Thirty-two studies described distinct methods, including instruction templates and instructional prompts [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref58">58</xref>-<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref64">64</xref>-<xref ref-type="bibr" rid="ref69">69</xref>], zero-shot learning [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref64">64</xref>], few-shot learning [<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref56">56</xref>], fine-tuning [<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref54">54</xref>], and hybrid approaches [<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref60">60</xref>,<xref ref-type="bibr" rid="ref68">68</xref>]. 
Training data were text-based in 33 studies [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref43">43</xref>-<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref47">47</xref>-<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref52">52</xref>-<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref63">63</xref>,<xref ref-type="bibr" rid="ref64">64</xref>,<xref ref-type="bibr" rid="ref66">66</xref>-<xref ref-type="bibr" rid="ref69">69</xref>], image-based in 2 studies [<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref62">62</xref>], and multimodal in 2 studies [<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref65">65</xref>]. 
Common outcome metrics included accuracy [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref48">48</xref>-<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref55">55</xref>-<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref59">59</xref>,<xref ref-type="bibr" rid="ref60">60</xref>,<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref64">64</xref>,<xref ref-type="bibr" rid="ref65">65</xref>,<xref ref-type="bibr" rid="ref68">68</xref>], <italic>F</italic><sub>1</sub>-score [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref65">65</xref>], area under the curve [<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref59">59</xref>], sensitivity [<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref65">65</xref>], and concordance rate [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref59">59</xref>-<xref ref-type="bibr" rid="ref61">61</xref>,<xref 
ref-type="bibr" rid="ref68">68</xref>]. A categorized summary is provided in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Prompt engineering and model training.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Study</td><td align="left" valign="bottom">Prompt method or content</td><td align="left" valign="bottom">Model input</td><td align="left" valign="bottom">Model output</td><td align="left" valign="bottom">Outcome indicators</td></tr></thead><tbody><tr><td align="left" valign="top">Zeng, 2025 [<xref ref-type="bibr" rid="ref8">8</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Standardized patient cases</td><td align="left" valign="top">Screening and monitoring recommendations</td><td align="left" valign="top">Correct/partially correct/incorrect proportions; descriptive statistics</td></tr><tr><td align="left" valign="top">Amini, 2025 [<xref ref-type="bibr" rid="ref13">13</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Colonoscopy reports, pathology, history, family history</td><td align="left" valign="top">Colonoscopy interval recommendation</td><td align="left" valign="top">Agreement percentage, Fleiss&#x2019; kappa, McNemar test</td></tr><tr><td align="left" valign="top">Chang, 2024 [<xref ref-type="bibr" rid="ref14">14</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Deidentified clinical data, colonoscopy reports, pathology reports</td><td align="left" valign="top">Follow-up colonoscopy interval suggestions</td><td align="left" valign="top">Agreement rate, Fleiss&#x2019; kappa</td></tr><tr><td align="left" valign="top">Johnson, 2025 [<xref ref-type="bibr" rid="ref17">17</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" 
valign="top">Pathology report text</td><td align="left" valign="top">Yes/no answer</td><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score, PPV<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>, NPV<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>, sensitivity, specificity, MCC<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">Lim, 2024 [<xref ref-type="bibr" rid="ref28">28</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Patient scenario descriptions</td><td align="left" valign="top">Colonoscopy interval recommendations</td><td align="left" valign="top">Correct interval percentage, hallucination rate</td></tr><tr><td align="left" valign="top">Gorelik, 2023 [<xref ref-type="bibr" rid="ref43">43</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Structured endoscopy reports &#x0026; free-text clinical notes</td><td align="left" valign="top">Guideline-based next-step recommendations; Patient result explanation letters</td><td align="left" valign="top">Guideline adherence, accuracy, Fleiss&#x2019; kappa</td></tr><tr><td align="left" valign="top">Alzaid, 2024 [<xref ref-type="bibr" rid="ref44">44</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Unstructured pathology reports</td><td align="left" valign="top">Structured JSON report with confidence</td><td align="left" valign="top">Accuracy, Kappa, AUROC<sup><xref ref-type="table-fn" rid="table3fn4">d</xref></sup></td></tr><tr><td align="left" valign="top">Kepez, 2024 [<xref ref-type="bibr" rid="ref48">48</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">20 common questions on colon cancer</td><td align="left" valign="top">Answer text for each question</td><td align="left" valign="top">DISCERN, GQS, JAMA criteria, Flesch-Kincaid readability, 
SAM, HITS, VPI, HONcode</td></tr><tr><td align="left" valign="top">Zhang, 2025 [<xref ref-type="bibr" rid="ref55">55</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Chinese Society of Clinical Oncology guideline standards / instructions</td><td align="left" valign="top">Colorectal cancer screening educational text</td><td align="left" valign="top">Accuracy, clarity, rigor scores</td></tr><tr><td align="left" valign="top">Yang, 2025 [<xref ref-type="bibr" rid="ref58">58</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Natural language queries on clinical genomic data</td><td align="left" valign="top">Mutation profiles, survival curves, odds ratios</td><td align="left" valign="top"><italic>P</italic> values, hazard ratios, odds ratios</td></tr><tr><td align="left" valign="top">Wang, 2025 [<xref ref-type="bibr" rid="ref59">59</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Free-text colonoscopy reports</td><td align="left" valign="top">Report-level labels</td><td align="left" valign="top">Accuracy, average precision, dice similarity coefficient, AUC<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td></tr><tr><td align="left" valign="top">Sehgal, 2025 [<xref ref-type="bibr" rid="ref60">60</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Self-reported demographics</td><td align="left" valign="top">AI-generated personalized messages or chatbot dialogues</td><td align="left" valign="top">Intent score change, Cohen <italic>d</italic>, <italic>P</italic> values, OR<sup><xref ref-type="table-fn" rid="table3fn6">f</xref></sup>, Flesch-Kincaid readability</td></tr><tr><td align="left" valign="top">Schmutz, 2025 [<xref ref-type="bibr" rid="ref61">61</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Clinical patient summaries 
and pathology reports</td><td align="left" valign="top">Treatment/diagnostic recommendations</td><td align="left" valign="top">Recommendation type, information density, consistency, level of evidence, time efficiency</td></tr><tr><td align="left" valign="top">Massimi, 2025 [<xref ref-type="bibr" rid="ref62">62</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Colonoscopy video frames</td><td align="left" valign="top">Paris classification</td><td align="left" valign="top">Accuracy, sensitivity, specificity, Fleiss&#x2019; kappa</td></tr><tr><td align="left" valign="top">Ding, 2025 [<xref ref-type="bibr" rid="ref65">65</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Pathology images and text prompts</td><td align="left" valign="top">Tissue origin, lesion classification, diagnosis</td><td align="left" valign="top">Accuracy, sensitivity, specificity, PPV, NPV, <italic>F</italic><sub>1</sub>-score, Kappa, ICC<sup><xref ref-type="table-fn" rid="table3fn7">g</xref></sup></td></tr><tr><td align="left" valign="top">Diaz, 2025 [<xref ref-type="bibr" rid="ref66">66</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Natural language queries for scanning and validating clinical genomic datasets</td><td align="left" valign="top">Survival analysis results, mutation frequency comparisons, statistical significance</td><td align="left" valign="top"><italic>P</italic> values, odds ratios, survival rates</td></tr><tr><td align="left" valign="top">Chatziisaak, 2025 [<xref ref-type="bibr" rid="ref67">67</xref>]</td><td align="left" valign="top">Instruction template</td><td align="left" valign="top">Patient clinical data</td><td align="left" valign="top">Treatment recommendation</td><td align="left" valign="top">Consistency, chi-square test</td></tr><tr><td align="left" valign="top">Qu, 2026 [<xref ref-type="bibr" rid="ref20">20</xref>]</td><td 
align="left" valign="top">Instruction template; Multi-role prompting</td><td align="left" valign="top">Structured variables and free-text summaries from clinical records</td><td align="left" valign="top">Four-category treatment classification code</td><td align="left" valign="top">Intra-model agreement; expert-model concordance, Cohen &#x03BA;</td></tr><tr><td align="left" valign="top">Garg, 2026 [<xref ref-type="bibr" rid="ref68">68</xref>]</td><td align="left" valign="top">Instruction template; Role prompting; Few-shot; Chain-of-thought; JSON schema enforcement</td><td align="left" valign="top">Colonoscopy reports, pathology reports, patient family history and preoperative diagnoses</td><td align="left" valign="top">Structured clinical entities and 2020 USMSTF-based surveillance interval recommendations; 2024 ACG/ASGE quality indicators</td><td align="left" valign="top">Case-level accuracy, Cohen &#x03BA;; Fleiss&#x2019; &#x03BA;; ADR, SSLDR, cecal intubation rate, bowel prep adequacy</td></tr><tr><td align="left" valign="top">Kim, 2025 [<xref ref-type="bibr" rid="ref69">69</xref>]</td><td align="left" valign="top">Instruction template; Role prompting</td><td align="left" valign="top">Unstructured preoperative abdominal CT / rectal MRI reports</td><td align="left" valign="top">Lesion location and cTNM stage and reasoning</td><td align="left" valign="top">Lesion location accuracy</td></tr><tr><td align="left" valign="top">Kim, 2025 [<xref ref-type="bibr" rid="ref51">51</xref>]</td><td align="left" valign="top">Fine-tuning</td><td align="left" valign="top">CT images and radiology report texts</td><td align="left" valign="top">Binary NAR score classification</td><td align="left" valign="top">AUC</td></tr><tr><td align="left" valign="top">Chizhikova, 2025 [<xref ref-type="bibr" rid="ref52">52</xref>]</td><td align="left" valign="top">Fine-tuning</td><td align="left" valign="top">Spanish colon MRI report texts, numerical features, categorical features</td><td 
align="left" valign="top">TNM<sup><xref ref-type="table-fn" rid="table3fn8">h</xref></sup> staging</td><td align="left" valign="top">Accuracy, macro <italic>F</italic><sub>1</sub>-score, precision, recall</td></tr><tr><td align="left" valign="top">Yang, 2025 [<xref ref-type="bibr" rid="ref54">54</xref>]</td><td align="left" valign="top">Fine-tuning</td><td align="left" valign="top">Clinical EHR<sup><xref ref-type="table-fn" rid="table3fn9">i</xref></sup> data</td><td align="left" valign="top">Binary colorectal adenoma risk</td><td align="left" valign="top">AUC, sensitivity, specificity, <italic>F</italic><sub>1</sub>-score, PPV, NPV, mean lead time</td></tr><tr><td align="left" valign="top">Ferber, 2024 [<xref ref-type="bibr" rid="ref46">46</xref>]</td><td align="left" valign="top">Few-shot</td><td align="left" valign="top">Cancer pathology images</td><td align="left" valign="top">Image classification labels</td><td align="left" valign="top">Accuracy, confidence interval, recall</td></tr><tr><td align="left" valign="top">Zeng, 2025 [<xref ref-type="bibr" rid="ref56">56</xref>]</td><td align="left" valign="top">Few-shot; Role prompting; Context learning</td><td align="left" valign="top">Real-world pathology report text</td><td align="left" valign="top">Recommendation on need for additional surgery</td><td align="left" valign="top">Accuracy; guideline consistency proportion</td></tr><tr><td align="left" valign="top">Peng, 2024 [<xref ref-type="bibr" rid="ref7">7</xref>]</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Medical questions from books</td><td align="left" valign="top">Colorectal cancer-related answers</td><td align="left" valign="top">Accuracy, comprehensiveness scores</td></tr><tr><td align="left" valign="top">Zhou, 2024 [<xref ref-type="bibr" rid="ref25">25</xref>]</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">150 CRC-related<sup><xref ref-type="table-fn" rid="table3fn10">j</xref></sup> 
closed-ended questions</td><td align="left" valign="top">Yes/no answers</td><td align="left" valign="top">Accuracy</td></tr><tr><td align="left" valign="top">Liu, 2024 [<xref ref-type="bibr" rid="ref49">49</xref>]</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Colorectal cancer case report texts</td><td align="left" valign="top">Primary/secondary diagnoses</td><td align="left" valign="top">Accuracy</td></tr><tr><td align="left" valign="top">Wang, 2024 [<xref ref-type="bibr" rid="ref50">50</xref>]</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Pathology report text and related questions</td><td align="left" valign="top">Answers to pathology questions</td><td align="left" valign="top">7-point Likert scale</td></tr><tr><td align="left" valign="top">Horesh, 2025 [<xref ref-type="bibr" rid="ref53">53</xref>]</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Clinical patient summaries</td><td align="left" valign="top">Next best management recommendation</td><td align="left" valign="top">Consistency with multidisciplinary team decisions, reasonableness score, interrater reliability</td></tr><tr><td align="left" valign="top">Yu, 2025 [<xref ref-type="bibr" rid="ref57">57</xref>]</td><td align="left" valign="top">Zero-shot; Chain-of-thought</td><td align="left" valign="top">Endoscopy/colonoscopy report texts</td><td align="left" valign="top">Structured JSON including lesion location, features, layer structure, distribution, diagnosis</td><td align="left" valign="top">Precision, recall, <italic>F</italic><sub>1</sub>-score, accuracy</td></tr><tr><td align="left" valign="top">Hu, 2025 [<xref ref-type="bibr" rid="ref64">64</xref>]</td><td align="left" valign="top">Zero-shot</td><td align="left" valign="top">Patient question texts</td><td align="left" valign="top">Answer texts</td><td align="left" valign="top">Accuracy, completeness, clarity scores</td></tr><tr><td align="left" 
valign="top">Maida, 2025 [<xref ref-type="bibr" rid="ref10">10</xref>]</td><td align="left" valign="top"><italic>&#x2014;</italic><sup><xref ref-type="table-fn" rid="table3fn11">k</xref></sup></td><td align="left" valign="top">15 questions on colorectal cancer screening</td><td align="left" valign="top">Text answers to questions</td><td align="left" valign="top">Accuracy, completeness, clarity scores</td></tr><tr><td align="left" valign="top">Emile, 2023 [<xref ref-type="bibr" rid="ref22">22</xref>]</td><td align="left" valign="top"><italic>&#x2014;</italic></td><td align="left" valign="top">38 common questions on CRC prevention, diagnosis, management</td><td align="left" valign="top">Text answers</td><td align="left" valign="top">Expert consensus; consistency with guidelines</td></tr><tr><td align="left" valign="top">Atarere, 2024 [<xref ref-type="bibr" rid="ref45">45</xref>]</td><td align="left" valign="top"><italic>&#x2014;</italic></td><td align="left" valign="top">15 questions on CRC screening concepts and 5 experience-based questions</td><td align="left" valign="top">Response appropriateness</td><td align="left" valign="top">Appropriateness rating</td></tr><tr><td align="left" valign="top">Kaiser, 2024 [<xref ref-type="bibr" rid="ref47">47</xref>]</td><td align="left" valign="top"><italic>&#x2014;</italic></td><td align="left" valign="top">Clinical scenario questions on next management</td><td align="left" valign="top">Text recommendations for clinical questions</td><td align="left" valign="top">Accuracy score, consistency, verbosity</td></tr><tr><td align="left" valign="top">Maida, 2025 [<xref ref-type="bibr" rid="ref63">63</xref>]</td><td align="left" valign="top"><italic>&#x2014;</italic></td><td align="left" valign="top">Patient queries</td><td align="left" valign="top">ChatGPT-generated answers</td><td align="left" valign="top">Expert scores, patient scores</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>PPV: positive 
predictive value.</p></fn><fn id="table3fn2"><p><sup>b</sup>NPV: negative predictive value.</p></fn><fn id="table3fn3"><p><sup>c</sup>MCC: Matthews correlation coefficient.</p></fn><fn id="table3fn4"><p><sup>d</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table3fn5"><p><sup>e</sup>AUC: area under the curve.</p></fn><fn id="table3fn6"><p><sup>f</sup>OR: odds ratio.</p></fn><fn id="table3fn7"><p><sup>g</sup>ICC: intraclass correlation coefficient.</p></fn><fn id="table3fn8"><p><sup>h</sup>TNM: tumor&#x2013;node&#x2013;metastasis.</p></fn><fn id="table3fn9"><p><sup>i</sup>EHR: electronic health record.</p></fn><fn id="table3fn10"><p><sup>j</sup>CRC: colorectal cancer.</p></fn><fn id="table3fn11"><p><sup>k</sup>Prompt method was not explicitly reported.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Risk of Bias in Studies</title><p>The included studies were categorized by research objective, and quality was assessed using the corresponding appraisal tool. The kappa value between the 2 reviewers was 0.95. Two predictive modeling studies [<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref54">54</xref>] were evaluated using PROBAST (<xref ref-type="fig" rid="figure2">Figure 2A</xref>); both showed low risk of bias across the participants, predictors, and outcome domains, but one exhibited high risk of bias in the analysis domain. 
Eighteen diagnostic studies [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref43">43</xref>-<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref59">59</xref>,<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref65">65</xref>,<xref ref-type="bibr" rid="ref67">67</xref>,<xref ref-type="bibr" rid="ref68">68</xref>] were assessed using QUADAS-2 (<xref ref-type="fig" rid="figure2">Figure 2B</xref>); while most demonstrated acceptable applicability, risk of bias in the patient selection domain was frequently unclear or high. Seventeen intervention studies [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref60">60</xref>,<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref63">63</xref>,<xref ref-type="bibr" rid="ref64">64</xref>,<xref ref-type="bibr" rid="ref66">66</xref>,<xref ref-type="bibr" rid="ref69">69</xref>] were appraised using ROBINS-I (<xref ref-type="fig" rid="figure2">Figure 2C</xref>); risk of bias was predominantly low for participant selection, deviations from intended interventions, and missing data, but moderate to serious for outcome measurement and classification of interventions.</p><fig position="float" id="figure2"><label>Figure 
2.</label><caption><p>(A) The quality appraisal for 2 predictive studies with PROBAST (prediction model risk of bias assessment tool). (B) The quality appraisal for 18 diagnostic studies with QUADAS-2 (Quality Assessment of Diagnostic Accuracy Studies-2). (C) The quality appraisal for 17 intervention trials with ROBINS-I (Risk of Bias in Nonrandomized Studies - of Interventions).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e89862_fig02.png"/></fig><p>Overall, 27 of the 37 included studies were rated above low risk of bias: 6 as high or serious [<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref44">44</xref>-<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref57">57</xref>,<xref ref-type="bibr" rid="ref59">59</xref>], 7 as unclear [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref65">65</xref>], and 14 as moderate [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref60">60</xref>,<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref63">63</xref>,<xref ref-type="bibr" rid="ref64">64</xref>,<xref ref-type="bibr" rid="ref66">66</xref>], while 10 were rated as low [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" 
rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref67">67</xref>-<xref ref-type="bibr" rid="ref69">69</xref>]. The most problematic domains across tools were outcome measurement [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref46">46</xref>-<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref60">60</xref>-<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref64">64</xref>,<xref ref-type="bibr" rid="ref66">66</xref>,<xref ref-type="bibr" rid="ref68">68</xref>] and patient selection [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref44">44</xref>-<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref59">59</xref>,<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref65">65</xref>]. 
Given these recurring concerns, particularly regarding blinding [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref60">60</xref>,<xref ref-type="bibr" rid="ref61">61</xref>], outcome measurement [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref46">46</xref>-<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref50">50</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref60">60</xref>-<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref64">64</xref>,<xref ref-type="bibr" rid="ref66">66</xref>,<xref ref-type="bibr" rid="ref68">68</xref>], and confounding [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref44">44</xref>-<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref60">60</xref>-<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref66">66</xref>,<xref ref-type="bibr" rid="ref68">68</xref>], and given the considerable heterogeneity in clinical tasks, LLMs, and outcome metrics, the overall certainty of evidence was judged as moderate to low. Quantitative meta-analysis was not feasible; even within the largest subgroup, fewer than 5 studies were sufficiently aligned in task definition, input modality, and reference standard to permit reliable pooling. 
A narrative synthesis was therefore adopted, and the findings should be interpreted with caution.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Through a comprehensive analysis of 37 studies, we identified 5 primary application domains of LLMs in CRC diagnosis and treatment: auxiliary diagnosis, information extraction, knowledge-based question-answering and patient education, treatment decision support, and scientific research and predictive modeling (<xref ref-type="table" rid="table2">Table 2</xref>). These domains are often interconnected in clinical practice. For instance, information extraction frequently provides structured data to support diagnostic processes [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref57">57</xref>], while knowledge-based question-answering is widely applied in scientific communication and patient education [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref63">63</xref>,<xref ref-type="bibr" rid="ref64">64</xref>].</p></sec><sec id="s4-2"><title>Applications of LLMs in CRC</title><p>LLMs enable the automated extraction of clinical features through NLP [<xref ref-type="bibr" rid="ref70">70</xref>]. Multiple studies have utilized LLMs to extract key information from EHRs [<xref ref-type="bibr" rid="ref17">17</xref>], endoscopy reports [<xref ref-type="bibr" rid="ref27">27</xref>], radiology reports [<xref ref-type="bibr" rid="ref25">25</xref>], and pathology reports [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref52">52</xref>]. This capability assists not only in clinical staging and histological classification [<xref ref-type="bibr" rid="ref71">71</xref>] but also in predicting disease progression and treatment response [<xref ref-type="bibr" rid="ref17">17</xref>]. 
For instance, lymph node metastasis assessment based on MRI reports [<xref ref-type="bibr" rid="ref51">51</xref>] and tumor progression prediction from radiology reports [<xref ref-type="bibr" rid="ref72">72</xref>] have shown promising accuracy. These advancements underscore the significant value of LLMs in early CRC screening. Early diagnosis can effectively improve survival rates [<xref ref-type="bibr" rid="ref25">25</xref>], and mass screening achieves a high detection rate for early-stage lesions [<xref ref-type="bibr" rid="ref73">73</xref>]. Wang leveraged LLMs to automatically extract knowledge from colonoscopy image-text records, enabling polyp detection and segmentation without manual annotation, thereby offering a novel approach to screening automation [<xref ref-type="bibr" rid="ref59">59</xref>]. A systematic review of LLMs in gastroenterology similarly demonstrated the potential applications of LLMs in gastrointestinal endoscopy and precancerous lesion screening [<xref ref-type="bibr" rid="ref74">74</xref>]. Despite challenges such as insufficient extraction performance for complex tasks and hallucinations, with a reported accuracy of only 55% for LLMs in classifying pedunculated polyps, which indicates that they cannot yet fully replace endoscopic experts [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref62">62</xref>], we remain optimistic about their future performance in assisting CRC diagnosis and early screening. 
This optimism is fueled by ongoing advancements in multimodal integration [<xref ref-type="bibr" rid="ref75">75</xref>], the development of domain-specific models [<xref ref-type="bibr" rid="ref46">46</xref>,<xref ref-type="bibr" rid="ref76">76</xref>], and the continuous optimization of training data [<xref ref-type="bibr" rid="ref34">34</xref>].</p><p>Leveraging their strong interactive capabilities and extensive knowledge, LLMs are widely evaluated for CRC medical question-answering and patient education [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. Furthermore, advancing multimodal models now enable LLMs to jointly analyze medical images and text, offering CRC diagnostic and therapeutic suggestions in controlled settings [<xref ref-type="bibr" rid="ref6">6</xref>]. Gong has recently emphasized that multimodal fusion has emerged as the dominant next-generation development trend for gastrointestinal artificial intelligence [<xref ref-type="bibr" rid="ref9">9</xref>]; however, this important technological milestone has not yet received adequate attention in available systematic reviews. Ferber demonstrated that multimodal LLMs applying in-context learning achieved near-pathologist-level classification of cancer pathology images [<xref ref-type="bibr" rid="ref46">46</xref>], and Kim [<xref ref-type="bibr" rid="ref51">51</xref>] showed that combined LLM and vision deep learning architectures outperformed either modality alone for neoadjuvant rectal score prediction. Together, these findings preliminarily suggest the potential of multimodal LLMs, which can approach pathologist-level performance in pathological image classification and clinical prediction tasks and can outperform single-modality models. Despite this progress, the diagnostic accuracy of current multimodal models on morphologically complex tasks remains constrained [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref62">62</xref>]. 
This reinforces the prevailing clinical consensus that current LLMs must be deployed strictly as decision-support adjuncts rather than autonomous diagnostic agents, thereby mitigating the significant clinical risks associated with automation bias and diagnostic delay [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref49">49</xref>]. Furthermore, extraction performance varied markedly, dictated by underlying model architecture and optimization strategy. GPT-4, augmented with multi-strategy prompting, appeared to outperform zero-shot baselines for colonoscopy report extraction [<xref ref-type="bibr" rid="ref57">57</xref>], while biomedical pretrained RoBERTa showed better performance than general-purpose GPT models for TNM staging in the available evidence from Spanish-language reports [<xref ref-type="bibr" rid="ref52">52</xref>]. This discrepancy suggests that domain-adaptive and language-specific pretraining confers fundamental structural semantic advantages that advanced prompt engineering alone cannot replicate [<xref ref-type="bibr" rid="ref44">44</xref>], consistent with recent evaluations where specialized models exhibited superior performance within data-constrained clinical settings [<xref ref-type="bibr" rid="ref77">77</xref>]. Nonetheless, the majority of this evidence is derived from retrospective analyses and single-center validations, with a notable paucity of prospective, multicenter clinical trials to confirm generalizability and real-world efficacy [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref66">66</xref>].</p><p>The NLP and named entity recognition capabilities of LLMs extend their utility beyond direct clinical support for practitioners and patients, substantially improving the efficacy of medical research workflows [<xref ref-type="bibr" rid="ref78">78</xref>]. 
In the domain of data extraction and analysis, Johnson leveraged the Gemma-2 model to accurately identify and extract key pathological diagnostic entities&#x2014;such as dysplasia, high-grade dysplasia/adenocarcinoma, and invasive carcinoma&#x2014;from unstructured pathology reports [<xref ref-type="bibr" rid="ref17">17</xref>]. This high accuracy aligns robustly with Chen et al [<xref ref-type="bibr" rid="ref6">6</xref>], who demonstrated comparable reliability in extracting oncological variables from EHRs, confirming automated information extraction as one of the most mature LLM applications. Beyond data retrieval, LLMs are increasingly serving as active engines for hypothesis generation [<xref ref-type="bibr" rid="ref79">79</xref>,<xref ref-type="bibr" rid="ref80">80</xref>]. Their probabilistic structure allows them to synthesize vast, disparate datasets and infer latent correlations that traditional algorithms might overlook [<xref ref-type="bibr" rid="ref72">72</xref>]. In translational medicine, Yang developed AI-HOPE-TP53, a LLaMA 3-based conversational agent that facilitates pathway-centric analysis of clinical genomic data in early-onset CRC [<xref ref-type="bibr" rid="ref58">58</xref>]. By rapidly generating statistical outputs like survival curves and hazard ratios, this system accelerates hypothesis-driven research in precision oncology [<xref ref-type="bibr" rid="ref81">81</xref>]. The viability of this paradigm shift is further corroborated by Abdel-Rehim, who experimentally validated that LLM-driven pipelines can successfully identify novel, laboratory-verifiable synergistic drug combinations [<xref ref-type="bibr" rid="ref80">80</xref>]. Furthermore, hybrid LLM architectures are democratizing access to complex analytical tools in routine practice. Yang et al [<xref ref-type="bibr" rid="ref54">54</xref>] developed an early-stage CRC adenoma risk prediction model combining BGE-M3 semantic vector encoding with XGBoost algorithms. 
By enabling clinicians without specialized computational expertise to perform sophisticated risk stratification based on LLM-processed outputs, such models substantially reduce the administrative burden and facilitate a more patient-centered clinical workflow [<xref ref-type="bibr" rid="ref6">6</xref>]. The research-supportive functions of LLMs have also expanded into foundational scholarly activities, including knowledge synthesis and the drafting of study protocols, ethics materials, and preliminary manuscript sections [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref59">59</xref>]. However, the originality and factual accuracy of such artificial intelligence-generated scholarly content necessitate rigorous human oversight to ensure scientific integrity [<xref ref-type="bibr" rid="ref82">82</xref>].</p></sec><sec id="s4-3"><title>Limitations of LLMs and Future Directions</title><p>Current research on LLMs in the field of CRC predominantly focuses on textual data processing [<xref ref-type="bibr" rid="ref83">83</xref>]; investigations into other modalities, including CT images [<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref67">67</xref>], histopathological slides [<xref ref-type="bibr" rid="ref58">58</xref>], and bioinformatics data [<xref ref-type="bibr" rid="ref58">58</xref>], remain in their nascent stages, demonstrating suboptimal output precision and task stability. General-purpose LLMs (eg, ChatGPT and the LLaMA series), predominantly pretrained on public databases, frequently manifest deficiencies such as delayed knowledge base updates, insufficient coverage of CRC subspecialty knowledge, a propensity for hallucinations, and an absence of authoritative evidence-based support for pivotal clinical content [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref76">76</xref>,<xref ref-type="bibr" rid="ref84">84</xref>]. 
Conversely, although existing medical-domain-specific LLMs (eg, Med-PaLM 2, BioBERT, and ClinicalBERT) possess certain advantages in general medical tasks, their comprehensive performance in complex subspecialty tasks, such as the precision treatment of CRC, still lags behind that of large-parameter general-purpose models [<xref ref-type="bibr" rid="ref57">57</xref>]. More critically, the reliability and generalizability of currently well-developed diagnostic and decision-support tools are severely hindered by methodological flaws; existing evidence relies disproportionately on retrospective, single-center datasets lacking temporal or geographic stratification [<xref ref-type="bibr" rid="ref77">77</xref>]. This evaluative paradigm renders models highly susceptible to overfitting and training data leakage, thereby precipitating a drastic degradation in performance within real-world clinical environments [<xref ref-type="bibr" rid="ref85">85</xref>]. There remains a critical paucity of rigorous prospective, multicenter clinical validation data within this domain [<xref ref-type="bibr" rid="ref86">86</xref>,<xref ref-type="bibr" rid="ref87">87</xref>].</p><p>The risk-of-bias assessment revealed several recurring methodological weaknesses across study designs. Among diagnostic accuracy studies evaluated with QUADAS-2, the patient selection domain was the most common source of concern, with ratings of &#x201C;unclear&#x201D; or &#x201C;high&#x201D; largely attributable to unreported sampling procedures and potentially inappropriate exclusion criteria [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref57">57</xref>]. 
For nonrandomized intervention studies assessed with ROBINS-I, the principal limitations were inadequate adjustment for confounding variables [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref46">46</xref>-<xref ref-type="bibr" rid="ref48">48</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref62">62</xref>] and the absence of blinded outcome assessment, both of which may bias effect estimates [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref51">51</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref55">55</xref>,<xref ref-type="bibr" rid="ref58">58</xref>,<xref ref-type="bibr" rid="ref62">62</xref>]. Prediction model studies appraised with PROBAST generally performed well in the participants, predictors, and outcome domains but showed weaknesses in the analysis domain, including limited sample size, unexplained participant attrition, and insufficiently described handling of missing data [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref45">45</xref>]. Collectively, these methodological limitations reduce the reliability of the current evidence base and constrain its translational applicability.</p><p>Beyond data-related constraints, the intrinsic technical vulnerabilities and compliance risks of LLMs pose substantial threats to clinical safety [<xref ref-type="bibr" rid="ref23">23</xref>]. 
The profound sensitivity of models to version iterations and prompt variations results in exceedingly poor reproducibility of outputs across multi-institutional settings [<xref ref-type="bibr" rid="ref77">77</xref>]. In the absence of specific instructional constraints, models are not only prone to hallucinations but may also exacerbate negative societal biases and stereotypes [<xref ref-type="bibr" rid="ref88">88</xref>]. Uncritical acceptance of these recommendations by clinicians may engender bias, subsequently precipitating critical diagnostic delays or inappropriate clinical interventions [<xref ref-type="bibr" rid="ref89">89</xref>]. Furthermore, constrained by the heterogeneity of patient requirements and the stringent governance of sensitive data, applications pertaining to patient follow-up and supportive care remain the most underdeveloped [<xref ref-type="bibr" rid="ref60">60</xref>]. Moreover, the pervasive absence of data privacy and information security protocols during the cloud-based deployment of open-source LLMs further impedes their clinical translation and real-world implementation [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>To address current technical bottlenecks, it is imperative to enhance model precision and reliability through future technological advancements [<xref ref-type="bibr" rid="ref90">90</xref>]. Multimodal integration is recognized as the predominant trajectory for next-generation technological development in this domain, offering the potential to transcend the limitations of unimodal text processing [<xref ref-type="bibr" rid="ref91">91</xref>]. Regarding optimization strategies, RAG technology emerges as an optimal solution for tailoring general-purpose models to subspecialty clinical scenarios [<xref ref-type="bibr" rid="ref9">9</xref>]. 
By interfacing with independent, verifiable, and authoritative subspecialty knowledge bases, RAG facilitates real-time knowledge updates, effectively enhances the concordance between model outputs and authoritative guidelines, substantially mitigates hallucinations, and endows models with robust interpretability [<xref ref-type="bibr" rid="ref92">92</xref>,<xref ref-type="bibr" rid="ref93">93</xref>]. Concurrently, prompt engineering (eg, instruction templates, few-shot learning, and chain-of-thought prompting) can rapidly augment the performance of general-purpose models in specific tasks, including pathological data extraction, treatment regimen recommendation, and follow-up protocol formulation, without altering underlying model weights [<xref ref-type="bibr" rid="ref94">94</xref>,<xref ref-type="bibr" rid="ref95">95</xref>].</p><p>Regarding clinical integration and ethical governance, future research priorities must pivot toward achieving real-world validity and safety [<xref ref-type="bibr" rid="ref96">96</xref>]. Primarily, prospective, multicenter clinical validations must be conducted for diagnostic and treatment planning applications, while patient follow-up and supportive care systems must be specifically developed to rectify deficiencies in full-cycle management [<xref ref-type="bibr" rid="ref97">97</xref>]. More crucially, LLMs cannot supplant medical professionals; their responsible clinical application must be strictly predicated on the establishment of a robust ethical governance framework [<xref ref-type="bibr" rid="ref98">98</xref>]. This necessitates the strict enforcement of their adjunctive role under continuous human supervision, concurrent with the resolution of data privacy issues and the assurance of foundational data quality [<xref ref-type="bibr" rid="ref99">99</xref>]. 
Ultimately, cross-disciplinary collaboration is imperative to delineate accountability, ensuring the synchronous evolution of governance frameworks and cutting-edge technologies [<xref ref-type="bibr" rid="ref82">82</xref>].</p></sec><sec id="s4-4"><title>Limitations of This Systematic Review</title><p>This review has several limitations. First, the majority of included studies were retrospective and single-center in design, and no prospective multicenter clinical trials establishing real-world LLM effectiveness in CRC care were identified. Only a minority conducted independent external validation, precluding confirmation of generalizability across diverse populations and institutions. Second, the rapid publication pace of LLM research means some recent developments may not have been captured despite the April 1, 2026, search cutoff. Third, restriction to English-language publications may introduce geographic bias. Fourth, several included studies evaluated proprietary commercial models such as GPT-4 and Claude, whose architectures and training data are not fully disclosed, introducing additional transparency and reproducibility concerns. No included study reported direct industry sponsorship for LLM evaluation. Finally, the search strategy was only cross-checked internally without formal external peer review, potentially leading to omission of a few unpublished or noncore journal studies. Inherent subjectivity in quality appraisal was mitigated through independent dual assessment, third-reviewer arbitration, and expert validation [<xref ref-type="bibr" rid="ref100">100</xref>,<xref ref-type="bibr" rid="ref101">101</xref>].</p></sec><sec id="s4-5"><title>Conclusions</title><p>This review establishes an integrative framework that synthesizes evidence across diverse study designs and LLM categories to compare their respective strengths and limitations in CRC care. 
Distinct from prior reviews that have addressed gastroenterology broadly or have been confined to a single study design, our work focuses specifically on the full-cycle CRC care continuum and, for the first time, comparatively evaluates general-purpose, domain-specific, and multimodal LLMs, thereby elucidating how prompt engineering and heterogeneous evaluation metrics shape reported outcomes. While our findings substantiate the clinical potential of LLMs, these results should be interpreted with caution, given the overall low quality of the available evidence. Most included studies failed to report key safeguards against bias&#x2014;such as blinding of outcome assessors, adequate adjustment for confounders, or the use of prospective, multicenter designs to validate model generalizability. Moreover, the substantial heterogeneity we observed across task types, LLM categories, prompt engineering strategies, reference standards, and outcome measures indicates that the performance advantages reported for any specific LLM are confined to the corresponding tasks and clinical scenarios and cannot be generalized. Future efforts should therefore prioritize the integration of LLMs into real-world clinical practice, which will require prospective, multicenter validation, a robust privacy-protection framework, and rigorous human oversight to mitigate bias. Against the backdrop of a rising global CRC burden and persistent disparities in health care resource allocation, this review provides an evidence base to inform the clinical translation, equitable scaling, and policy formulation surrounding LLM deployment in CRC care.</p></sec><sec id="s4-6"><title>Registration and Protocol</title><p>This systematic review was prospectively registered in the International Prospective Register of Systematic Reviews (PROSPERO) under registration number CRD420251248261. The review protocol is publicly accessible through the PROSPERO database. 
No separate protocol manuscript was published.</p><p>One amendment was made to the registered protocol: the literature search cutoff date was extended from November 1, 2025 to April 1, 2026, to capture the most recent publications prior to data synthesis. This amendment was implemented after the initial search had been completed and did not alter the review&#x2019;s eligibility criteria, synthesis methodology, or any other prespecified procedures. The narrative synthesis approach (SWiM), quality assessment tools (QUADAS-2, PROBAST, ROBINS-I), eligibility criteria, database selection, screening processes, and data extraction methods were all carried out as prespecified in the registered protocol. No other amendments were made.</p></sec></sec></body><back><ack><p>The authors sincerely thank Zhejiang Chinese Medical University and Hangzhou First People&#x2019;s Hospital for providing the academic research platform, professional literature resource support, and methodological guidance for the completion of this systematic review. This manuscript was originally drafted in Chinese and subsequently translated into English. During the preparation and translation process, the authors further used ChatGPT (OpenAI) to assist with English-language polishing. All AI-generated outputs were critically reviewed and manually edited by the authors, who take full responsibility for the accuracy and integrity of the final content. The authors declare the use of generative artificial intelligence (GAI) in the research and writing process. In accordance with the GAIDeT taxonomy (2025), GAI tools were used under full human supervision for idea generation, proofreading and editing, and translation. The GAI tools used were Gemini 3 and DeepSeek. Responsibility for the content and integrity of the final manuscript rests entirely with the authors. GAI tools are not listed as authors and do not bear responsibility for the final outcomes. 
This declaration is submitted under the collective responsibility of the authors.</p></ack><notes><sec><title>Funding</title><p>This work was supported by the Clinical Research Application Project of Zhejiang Provincial Medical and Health Science and Technology Program (grant number 2024KY190), the Hangzhou Municipal Medical and Health Science and Technology Program (grant number A20241859), and the Hangzhou Municipal Biomedical Special Project (grant number 2023WJC120).</p></sec><sec><title>Data Availability</title><p>The data that support the findings of this study are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>JT conceived the study, designed the methodology, and wrote the original draft. QL and XW contributed equally to data collection, analysis, and manuscript revision. HX, HM, and YY participated in data curation, validation, and discussion. All authors reviewed and approved the final manuscript. QL is the corresponding author. 
JT and XW contributed equally to this work.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">CRC</term><def><p>colorectal cancer</p></def></def-item><def-item><term id="abb2">EHR</term><def><p>electronic health record</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb5">PICOS</term><def><p>Population, Intervention, Comparison, Outcome, Study design</p></def></def-item><def-item><term id="abb6">PRISMA</term><def><p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses</p></def></def-item><def-item><term id="abb7">PRISMA-S</term><def><p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses Literature Search Extension</p></def></def-item><def-item><term id="abb8">PROBAST</term><def><p>Prediction Model Risk of Bias Assessment Tool</p></def></def-item><def-item><term id="abb9">PROSPERO</term><def><p>International Prospective Register of Systematic Reviews</p></def></def-item><def-item><term id="abb10">QUADAS-2</term><def><p>Quality Assessment of Diagnostic Accuracy Studies-2</p></def></def-item><def-item><term id="abb11">RAG</term><def><p>retrieval-augmented generation</p></def></def-item><def-item><term id="abb12">ROBINS-I</term><def><p>Risk of Bias in Nonrandomized Studies - of Interventions</p></def></def-item><def-item><term id="abb13">SWiM</term><def><p>synthesis without meta-analysis</p></def></def-item><def-item><term id="abb14">TNM</term><def><p>tumor&#x2013;node&#x2013;metastasis</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>M</given-names> </name></person-group><article-title>Global burden of colorectal cancer in 2022 and projections to 2050: incidence and mortality estimates from GLOBOCAN</article-title><source>BMC Cancer</source><year>2025</year><month>11</month><day>14</day><volume>25</volume><issue>1</issue><fpage>1770</fpage><pub-id pub-id-type="doi">10.1186/s12885-025-15138-0</pub-id><pub-id pub-id-type="medline">41239247</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yoshino</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ru&#x00ED;z-Garc&#x00ED;a</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Colorectal cancer</article-title><source>The Lancet</source><year>2024</year><month>07</month><volume>404</volume><issue>10449</issue><fpage>294</fpage><lpage>310</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(24)00360-X</pub-id><pub-id pub-id-type="medline">38278165</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sloss</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Abdul</surname><given-names>S</given-names> </name><name name-style="western"><surname>Aboagyewah</surname><given-names>MA</given-names> </name><etal/></person-group><article-title>Toward alleviating clinician documentation burden: a scoping review of burden reduction efforts</article-title><source>Appl Clin 
Inform</source><year>2024</year><month>05</month><volume>15</volume><issue>3</issue><fpage>446</fpage><lpage>455</lpage><pub-id pub-id-type="doi">10.1055/s-0044-1787007</pub-id><pub-id pub-id-type="medline">38839063</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Holmgren</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Apathy</surname><given-names>NC</given-names> </name><name name-style="western"><surname>Crews</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shanafelt</surname><given-names>T</given-names> </name></person-group><article-title>National trends in oncology specialists&#x2019; EHR inbox work, 2019-2022</article-title><source>J Natl Cancer Inst</source><year>2025</year><month>06</month><day>1</day><volume>117</volume><issue>6</issue><fpage>1253</fpage><lpage>1259</lpage><pub-id pub-id-type="doi">10.1093/jnci/djaf052</pub-id><pub-id pub-id-type="medline">40037649</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wong</surname><given-names>EYT</given-names> </name><name name-style="western"><surname>Verlingue</surname><given-names>L</given-names> </name><name name-style="western"><surname>Aldea</surname><given-names>M</given-names> </name><etal/></person-group><article-title>ESMO guidance on the use of large language models in clinical practice (ELCAP)</article-title><source>Ann Oncol</source><year>2025</year><month>12</month><volume>36</volume><issue>12</issue><fpage>1447</fpage><lpage>1457</lpage><pub-id pub-id-type="doi">10.1016/j.annonc.2025.09.001</pub-id><pub-id pub-id-type="medline">41111032</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Chen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Alnassar</surname><given-names>SA</given-names> </name><name name-style="western"><surname>Avison</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Raman</surname><given-names>S</given-names> </name></person-group><article-title>Large language model applications for health information extraction in oncology: scoping review</article-title><source>JMIR Cancer</source><year>2025</year><month>03</month><day>28</day><volume>11</volume><issue>1</issue><fpage>e65984</fpage><pub-id pub-id-type="doi">10.2196/65984</pub-id><pub-id pub-id-type="medline">40153782</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peng</surname><given-names>W</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Evaluating AI in medicine: a comparative analysis of expert and ChatGPT responses to colorectal cancer questions</article-title><source>Sci Rep</source><year>2024</year><volume>14</volume><issue>1</issue><fpage>2840</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-52853-3</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zeng</surname><given-names>A</given-names> </name><name name-style="western"><surname>Steinke</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bocse</surname><given-names>HF</given-names> </name><name name-style="western"><surname>De Pastena</surname><given-names>M</given-names> 
</name></person-group><article-title>Dr. LLM will see you now: the ability of ChatGPT to provide geographically tailored colorectal cancer screening and surveillance recommendations</article-title><source>J Clin Med</source><year>2025</year><month>07</month><day>18</day><volume>14</volume><issue>14</issue><fpage>5101</fpage><pub-id pub-id-type="doi">10.3390/jcm14145101</pub-id><pub-id pub-id-type="medline">40725794</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gong</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Bang</surname><given-names>CS</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>JJ</given-names> </name><etal/></person-group><article-title>Large language models in gastroenterology: systematic review</article-title><source>J Med Internet Res</source><year>2024</year><month>12</month><day>20</day><volume>26</volume><fpage>e66648</fpage><pub-id pub-id-type="doi">10.2196/66648</pub-id><pub-id pub-id-type="medline">39705703</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maida</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ramai</surname><given-names>D</given-names> </name><name name-style="western"><surname>Mori</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>The role of generative language systems in increasing patient awareness of colon cancer screening</article-title><source>Endoscopy</source><year>2025</year><month>03</month><volume>57</volume><issue>3</issue><fpage>262</fpage><lpage>268</lpage><pub-id pub-id-type="doi">10.1055/a-2388-6084</pub-id><pub-id pub-id-type="medline">39142348</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>EW</given-names> </name><name name-style="western"><surname>Waldrup</surname><given-names>B</given-names> </name><name name-style="western"><surname>Velazquez-Villarreal</surname><given-names>E</given-names> </name></person-group><article-title>Conversational artificial intelligence for integrating social determinants, genomics, and clinical data in precision medicine: development and implementation study of the AI-HOPE-PM system</article-title><source>JMIR Bioinform Biotechnol</source><year>2025</year><month>10</month><day>10</day><volume>6</volume><fpage>e76553</fpage><pub-id pub-id-type="doi">10.2196/76553</pub-id><pub-id pub-id-type="medline">41342165</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pereyra</surname><given-names>L</given-names> </name><name name-style="western"><surname>Schlottmann</surname><given-names>F</given-names> </name><name name-style="western"><surname>Steinberg</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lasa</surname><given-names>J</given-names> </name></person-group><article-title>Colorectal cancer prevention: is chat generative pretrained transformer (Chat GPT) ready to assist physicians in determining appropriate screening and surveillance recommendations?</article-title><source>J Clin Gastroenterol</source><year>2024</year><volume>58</volume><issue>10</issue><fpage>1022</fpage><lpage>1027</lpage><pub-id pub-id-type="doi">10.1097/MCG.0000000000001979</pub-id><pub-id pub-id-type="medline">38319619</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Amini</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Chang</surname><given-names>PW</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>RO</given-names> </name><etal/></person-group><article-title>Comparing ChatGPT3.5 and Bard recommendations for colonoscopy intervals: bridging the gap in healthcare settings</article-title><source>Endosc Int Open</source><year>2025</year><volume>13</volume><issue>CP</issue><fpage>a25865912</fpage><pub-id pub-id-type="doi">10.1055/a-2586-5912</pub-id><pub-id pub-id-type="medline">40611845</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chang</surname><given-names>PW</given-names> </name><name name-style="western"><surname>Amini</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>RO</given-names> </name><etal/></person-group><article-title>ChatGPT4 outperforms endoscopists for determination of postcolonoscopy rescreening and surveillance recommendations</article-title><source>Clin Gastroenterol Hepatol</source><year>2024</year><month>09</month><volume>22</volume><issue>9</issue><fpage>1917</fpage><lpage>1925</lpage><pub-id pub-id-type="doi">10.1016/j.cgh.2024.04.022</pub-id><pub-id pub-id-type="medline">38729387</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Omar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Nassar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sharif</surname><given-names>K</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>GN</given-names> </name><name 
name-style="western"><surname>Klang</surname><given-names>E</given-names> </name></person-group><article-title>Emerging applications of NLP and large language models in gastroenterology and hepatology: a systematic review</article-title><source>Front Med (Lausanne)</source><year>2024</year><volume>11</volume><fpage>1512824</fpage><pub-id pub-id-type="doi">10.3389/fmed.2024.1512824</pub-id><pub-id pub-id-type="medline">39917263</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Naito</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nosaka</surname><given-names>T</given-names> </name><name name-style="western"><surname>Tanaka</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Usefulness of an artificial intelligence-based colonoscopy report generation support system</article-title><source>Clin Endosc</source><year>2025</year><month>03</month><volume>58</volume><issue>2</issue><fpage>327</fpage><lpage>330</lpage><pub-id pub-id-type="doi">10.5946/ce.2024.213</pub-id><pub-id pub-id-type="medline">40010702</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>B</given-names> </name><name name-style="western"><surname>Bath</surname><given-names>T</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Large language models for extracting histopathologic diagnoses of colorectal cancer and dysplasia from electronic health records</article-title><source>BMJ Open Gastroenterol</source><year>2025</year><month>09</month><day>18</day><volume>12</volume><issue>1</issue><fpage>e001896</fpage><pub-id 
pub-id-type="doi">10.1136/bmjgast-2025-001896</pub-id><pub-id pub-id-type="medline">40973184</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Br&#x00E4;utigam</surname><given-names>K</given-names> </name><name name-style="western"><surname>Baker</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Koelzer</surname><given-names>VH</given-names> </name><name name-style="western"><surname>Kather</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Graham</surname><given-names>TA</given-names> </name></person-group><article-title>Integrating artificial intelligence (AI) into colorectal cancer reporting</article-title><source>J Pathol</source><year>2026</year><month>04</month><volume>268</volume><issue>4</issue><fpage>367</fpage><lpage>382</lpage><pub-id pub-id-type="doi">10.1002/path.70029</pub-id><pub-id pub-id-type="medline">41588707</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Y&#x0131;lmaz</surname><given-names>M</given-names> </name><name name-style="western"><surname>Abbasl&#x0131;</surname><given-names>N</given-names> </name><name name-style="western"><surname>Tuna</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Comparison of artificial intelligence and multidisciplinary team recommendations in the management of colorectal cancer liver metastases</article-title><source>Sci Rep</source><year>2026</year><volume>16</volume><issue>1</issue><fpage>7278</fpage><pub-id pub-id-type="doi">10.1038/s41598-026-38449-z</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Qu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Cao</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Comparison of large language models and expert multidisciplinary team decisions in colorectal cancer</article-title><source>BMJ Health Care Inform</source><year>2026</year><month>03</month><day>10</day><volume>33</volume><issue>1</issue><fpage>e101780</fpage><pub-id pub-id-type="doi">10.1136/bmjhci-2025-101780</pub-id><pub-id pub-id-type="medline">41806973</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Biesheuvel</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Workum</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Reuland</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Large language models in critical care</article-title><source>J Intensive Med</source><year>2025</year><month>04</month><volume>5</volume><issue>2</issue><fpage>113</fpage><lpage>118</lpage><pub-id pub-id-type="doi">10.1016/j.jointm.2024.12.001</pub-id><pub-id pub-id-type="medline">40241839</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Emile</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Horesh</surname><given-names>N</given-names> </name><name name-style="western"><surname>Freund</surname><given-names>M</given-names> </name><etal/></person-group><article-title>How appropriate are answers of online chat-based artificial intelligence (ChatGPT) to common questions on colon 
cancer?</article-title><source>Surgery</source><year>2023</year><month>11</month><volume>174</volume><issue>5</issue><fpage>1273</fpage><lpage>1275</lpage><pub-id pub-id-type="doi">10.1016/j.surg.2023.06.005</pub-id><pub-id pub-id-type="medline">37482439</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haltaufderheide</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ranisch</surname><given-names>R</given-names> </name></person-group><article-title>The ethics of ChatGPT in medicine and healthcare: a systematic review on large language models (LLMs)</article-title><source>NPJ Digit Med</source><year>2024</year><month>07</month><day>8</day><volume>7</volume><issue>1</issue><fpage>183</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01157-x</pub-id><pub-id pub-id-type="medline">38977771</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tian</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>W</given-names> </name></person-group><article-title>A survey on medical competence evaluation benchmarks for large language models</article-title><source>Health Care Sci</source><year>2026</year><month>02</month><volume>5</volume><issue>1</issue><fpage>4</fpage><lpage>18</lpage><pub-id pub-id-type="doi">10.1002/hcs2.70050</pub-id><pub-id pub-id-type="medline">41767169</pub-id></nlm-citation></ref><ref 
id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>S</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>C</given-names> </name><etal/></person-group><article-title>The performance of large language model-powered chatbots compared to oncology physicians on colorectal cancer queries</article-title><source>Int J Surg</source><year>2024</year><month>10</month><day>1</day><volume>110</volume><issue>10</issue><fpage>6509</fpage><lpage>6517</lpage><pub-id pub-id-type="doi">10.1097/JS9.0000000000001850</pub-id><pub-id pub-id-type="medline">38935100</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jeon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>HG</given-names> </name></person-group><article-title>A comparative evaluation of chain-of-thought-based prompt engineering techniques for medical question answering</article-title><source>Comput Biol Med</source><year>2025</year><month>09</month><volume>196</volume><issue>Pt A</issue><fpage>110614</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2025.110614</pub-id><pub-id pub-id-type="medline">40602316</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sivarajkumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kelley</surname><given-names>M</given-names> </name><name name-style="western"><surname>Samolyk-Mazzanti</surname><given-names>A</given-names> </name><name name-style="western"><surname>Visweswaran</surname><given-names>S</given-names> 
</name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name></person-group><article-title>An empirical evaluation of prompting strategies for large language models in zero-shot clinical natural language processing: algorithm development and validation study</article-title><source>JMIR Med Inform</source><year>2024</year><month>04</month><day>8</day><volume>12</volume><fpage>e55318</fpage><pub-id pub-id-type="doi">10.2196/55318</pub-id><pub-id pub-id-type="medline">38587879</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lim</surname><given-names>DYZ</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>YB</given-names> </name><name name-style="western"><surname>Koh</surname><given-names>JTE</given-names> </name><etal/></person-group><article-title>ChatGPT on guidelines: providing contextual knowledge to GPT allows it to provide advice on appropriate colonoscopy intervals</article-title><source>J Gastroenterol Hepatol</source><year>2024</year><month>01</month><volume>39</volume><issue>1</issue><fpage>81</fpage><lpage>106</lpage><pub-id pub-id-type="doi">10.1111/jgh.16375</pub-id><pub-id pub-id-type="medline">37855067</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Amugongo</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Mascheroni</surname><given-names>P</given-names> </name><name name-style="western"><surname>Brooks</surname><given-names>S</given-names> </name><name name-style="western"><surname>Doering</surname><given-names>S</given-names> </name><name name-style="western"><surname>Seidel</surname><given-names>J</given-names> </name></person-group><article-title>Retrieval augmented generation for large language 
models in healthcare: a systematic review</article-title><source>PLOS Digit Health</source><year>2025</year><month>06</month><volume>4</volume><issue>6</issue><fpage>e0000877</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000877</pub-id><pub-id pub-id-type="medline">40498738</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Z</given-names> </name></person-group><article-title>Adversarial prompt and fine-tuning attacks threaten medical large language models</article-title><source>Nat Commun</source><year>2025</year><month>10</month><day>9</day><volume>16</volume><issue>1</issue><fpage>9011</fpage><pub-id pub-id-type="doi">10.1038/s41467-025-64062-1</pub-id><pub-id pub-id-type="medline">41068092</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Savage</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nayak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gallo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rangan</surname><given-names>E</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>JH</given-names> </name></person-group><article-title>Diagnostic reasoning prompts reveal the potential for large language model interpretability in medicine</article-title><source>NPJ Digit Med</source><year>2024</year><month>01</month><day>24</day><volume>7</volume><issue>1</issue><fpage>20</fpage><pub-id 
pub-id-type="doi">10.1038/s41746-024-01010-1</pub-id><pub-id pub-id-type="medline">38267608</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Williams</surname><given-names>CYK</given-names> </name><name name-style="western"><surname>Miao</surname><given-names>BY</given-names> </name><name name-style="western"><surname>Kornblith</surname><given-names>AE</given-names> </name><name name-style="western"><surname>Butte</surname><given-names>AJ</given-names> </name></person-group><article-title>Evaluating the use of large language models to provide clinical recommendations in the emergency department</article-title><source>Nat Commun</source><year>2024</year><month>10</month><day>8</day><volume>15</volume><issue>1</issue><fpage>8236</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-52415-1</pub-id><pub-id pub-id-type="medline">39379357</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhong</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Large language models in lung cancer: systematic review</article-title><source>J Med Internet Res</source><year>2025</year><month>09</month><day>30</day><volume>27</volume><fpage>e74177</fpage><pub-id pub-id-type="doi">10.2196/74177</pub-id><pub-id pub-id-type="medline">41026980</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Qiu</surname><given-names>Z</given-names> 
</name><name name-style="western"><surname>Holmes</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Large language model integrations in cancer decision-making: a systematic review and meta-analysis</article-title><source>NPJ Digit Med</source><year>2025</year><month>07</month><day>17</day><volume>8</volume><issue>1</issue><fpage>450</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01824-7</pub-id><pub-id pub-id-type="medline">40676129</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clusmann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kolbinger</surname><given-names>FR</given-names> </name><name name-style="western"><surname>Muti</surname><given-names>HS</given-names> </name><etal/></person-group><article-title>The future landscape of large language models in medicine</article-title><source>Commun Med (Lond)</source><year>2023</year><month>10</month><day>10</day><volume>3</volume><issue>1</issue><fpage>141</fpage><pub-id pub-id-type="doi">10.1038/s43856-023-00370-1</pub-id><pub-id pub-id-type="medline">37816837</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Page</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>McKenzie</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Bossuyt</surname><given-names>PM</given-names> </name><etal/></person-group><article-title>The PRISMA 2020 statement: an updated guideline for reporting systematic reviews</article-title><source>BMJ</source><year>2021</year><month>03</month><day>29</day><volume>372</volume><fpage>n71</fpage><pub-id pub-id-type="doi">10.1136/bmj.n71</pub-id><pub-id pub-id-type="medline">33782057</pub-id></nlm-citation></ref><ref 
id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rethlefsen</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Kirtley</surname><given-names>S</given-names> </name><name name-style="western"><surname>Waffenschmidt</surname><given-names>S</given-names> </name><etal/></person-group><article-title>PRISMA-S: an extension to the PRISMA statement for reporting literature searches in systematic reviews</article-title><source>Syst Rev</source><year>2021</year><month>01</month><day>26</day><volume>10</volume><issue>1</issue><fpage>39</fpage><pub-id pub-id-type="doi">10.1186/s13643-020-01542-z</pub-id><pub-id pub-id-type="medline">33499930</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Campbell</surname><given-names>M</given-names> </name><name name-style="western"><surname>McKenzie</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Sowden</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Synthesis without meta-analysis (SWiM) in systematic reviews: reporting guideline</article-title><source>BMJ</source><year>2020</year><month>01</month><day>16</day><volume>368</volume><fpage>l6890</fpage><pub-id pub-id-type="doi">10.1136/bmj.l6890</pub-id><pub-id pub-id-type="medline">31948937</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Omar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Levkovich</surname><given-names>I</given-names> </name></person-group><article-title>Exploring the efficacy and potential of large language models for depression: a systematic review</article-title><source>J Affect 
Disord</source><year>2025</year><month>02</month><day>15</day><volume>371</volume><fpage>234</fpage><lpage>244</lpage><pub-id pub-id-type="doi">10.1016/j.jad.2024.11.052</pub-id><pub-id pub-id-type="medline">39581383</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Whiting</surname><given-names>PF</given-names> </name><name name-style="western"><surname>Rutjes</surname><given-names>AWS</given-names> </name><name name-style="western"><surname>Westwood</surname><given-names>ME</given-names> </name><etal/></person-group><article-title>QUADAS-2: a revised tool for the quality assessment of diagnostic accuracy studies</article-title><source>Ann Intern Med</source><year>2011</year><month>10</month><day>18</day><volume>155</volume><issue>8</issue><fpage>529</fpage><lpage>536</lpage><pub-id pub-id-type="doi">10.7326/0003-4819-155-8-201110180-00009</pub-id><pub-id pub-id-type="medline">22007046</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moons</surname><given-names>KGM</given-names> </name><name name-style="western"><surname>Wolff</surname><given-names>RF</given-names> </name><name name-style="western"><surname>Riley</surname><given-names>RD</given-names> </name><etal/></person-group><article-title>PROBAST: a tool to assess risk of bias and applicability of prediction model studies: explanation and elaboration</article-title><source>Ann Intern Med</source><year>2019</year><month>01</month><day>1</day><volume>170</volume><issue>1</issue><fpage>W1</fpage><lpage>W33</lpage><pub-id pub-id-type="doi">10.7326/M18-1377</pub-id><pub-id pub-id-type="medline">30596876</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Sterne</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Hern&#x00E1;n</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Reeves</surname><given-names>BC</given-names> </name><etal/></person-group><article-title>ROBINS-I: a tool for assessing risk of bias in non-randomised studies of interventions</article-title><source>BMJ</source><year>2016</year><month>10</month><day>12</day><volume>355</volume><fpage>i4919</fpage><pub-id pub-id-type="doi">10.1136/bmj.i4919</pub-id><pub-id pub-id-type="medline">27733354</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gorelik</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ghersin</surname><given-names>I</given-names> </name><name name-style="western"><surname>Maza</surname><given-names>I</given-names> </name><name name-style="western"><surname>Klein</surname><given-names>A</given-names> </name></person-group><article-title>Harnessing language models for streamlined postcolonoscopy patient management: a novel approach</article-title><source>Gastrointest Endosc</source><year>2023</year><month>10</month><volume>98</volume><issue>4</issue><fpage>639</fpage><lpage>641</lpage><pub-id pub-id-type="doi">10.1016/j.gie.2023.06.025</pub-id><pub-id pub-id-type="medline">37385548</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alzaid</surname><given-names>E</given-names> </name><name name-style="western"><surname>Pergola</surname><given-names>G</given-names> </name><name name-style="western"><surname>Evans</surname><given-names>H</given-names> </name><name name-style="western"><surname>Snead</surname><given-names>D</given-names> </name><name 
name-style="western"><surname>Minhas</surname><given-names>F</given-names> </name></person-group><article-title>Large multimodal model-based standardisation of pathology reports with confidence and its prognostic significance</article-title><source>J Pathol Clin Res</source><year>2024</year><month>11</month><volume>10</volume><issue>6</issue><fpage>e70010</fpage><pub-id pub-id-type="doi">10.1002/2056-4538.70010</pub-id><pub-id pub-id-type="medline">39545631</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Atarere</surname><given-names>J</given-names> </name><name name-style="western"><surname>Naqvi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Haas</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Applicability of online chat-based artificial intelligence models to colorectal cancer screening</article-title><source>Dig Dis Sci</source><year>2024</year><month>03</month><volume>69</volume><issue>3</issue><fpage>791</fpage><lpage>797</lpage><pub-id pub-id-type="doi">10.1007/s10620-024-08274-3</pub-id><pub-id pub-id-type="medline">38267726</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ferber</surname><given-names>D</given-names> </name><name name-style="western"><surname>W&#x00F6;lflein</surname><given-names>G</given-names> </name><name name-style="western"><surname>Wiest</surname><given-names>IC</given-names> </name><etal/></person-group><article-title>In-context learning enables multimodal large language models to classify cancer pathology images</article-title><source>Nat Commun</source><year>2024</year><month>11</month><day>21</day><volume>15</volume><issue>1</issue><fpage>10104</fpage><pub-id 
pub-id-type="doi">10.1038/s41467-024-51465-9</pub-id><pub-id pub-id-type="medline">39572531</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kaiser</surname><given-names>KN</given-names> </name><name name-style="western"><surname>Hughes</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>AD</given-names> </name><etal/></person-group><article-title>Accuracy and consistency of publicly available large language models as clinical decision support tools for the management of colon cancer</article-title><source>J Surg Oncol</source><year>2024</year><month>10</month><volume>130</volume><issue>5</issue><fpage>1104</fpage><lpage>1110</lpage><pub-id pub-id-type="doi">10.1002/jso.27821</pub-id><pub-id pub-id-type="medline">39155667</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kepez</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Ugur</surname><given-names>F</given-names> </name></person-group><article-title>Comparative evaluation of information quality on colon cancer for patients: a study of ChatGPT-4 and Google</article-title><source>Cureus</source><year>2024</year><month>11</month><volume>16</volume><issue>11</issue><fpage>e73989</fpage><pub-id pub-id-type="doi">10.7759/cureus.73989</pub-id><pub-id pub-id-type="medline">39703246</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>D</given-names> 
</name><etal/></person-group><article-title>The diagnostic ability of GPT-3.5 and GPT-4.0 in surgery: comparative analysis</article-title><source>J Med Internet Res</source><year>2024</year><month>09</month><day>10</day><volume>26</volume><fpage>e54985</fpage><pub-id pub-id-type="doi">10.2196/54985</pub-id><pub-id pub-id-type="medline">39255016</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Large language model answers medical questions about standard pathology reports</article-title><source>Front Med (Lausanne)</source><year>2024</year><volume>11</volume><fpage>1402457</fpage><pub-id pub-id-type="doi">10.3389/fmed.2024.1402457</pub-id><pub-id pub-id-type="medline">39359921</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>HB</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>HQ</given-names> </name><name name-style="western"><surname>Nei</surname><given-names>WL</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>YCRS</given-names> </name><name name-style="western"><surname>Cai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>F</given-names> </name></person-group><article-title>Impact of large language models and vision deep learning models in predicting neoadjuvant rectal score for rectal cancer treated with neoadjuvant chemoradiation</article-title><source>BMC Med 
Imaging</source><year>2025</year><month>07</month><day>31</day><volume>25</volume><issue>1</issue><fpage>306</fpage><pub-id pub-id-type="doi">10.1186/s12880-025-01844-5</pub-id><pub-id pub-id-type="medline">40745280</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chizhikova</surname><given-names>M</given-names> </name><name name-style="western"><surname>L&#x00F3;pez-&#x00DA;beda</surname><given-names>P</given-names> </name><name name-style="western"><surname>Mart&#x00ED;n-Noguerol</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Automatic TNM staging of colorectal cancer radiology reports using pre-trained language models</article-title><source>Comput Methods Programs Biomed</source><year>2025</year><month>02</month><volume>259</volume><fpage>108515</fpage><pub-id pub-id-type="doi">10.1016/j.cmpb.2024.108515</pub-id><pub-id pub-id-type="medline">39602989</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Horesh</surname><given-names>N</given-names> </name><name name-style="western"><surname>Emile</surname><given-names>SH</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Comparing the management recommendations of large language model and colorectal cancer multidisciplinary team: a pilot study</article-title><source>Dis Colon Rectum</source><year>2025</year><month>01</month><day>1</day><volume>68</volume><issue>1</issue><fpage>41</fpage><lpage>47</lpage><pub-id pub-id-type="doi">10.1097/DCR.0000000000003504</pub-id><pub-id pub-id-type="medline">39679608</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Ji</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name></person-group><article-title>Early prediction of colorectal adenoma risk: leveraging large-language model for clinical electronic medical record data</article-title><source>Front Oncol</source><year>2025</year><volume>15</volume><fpage>1508455</fpage><pub-id pub-id-type="doi">10.3389/fonc.2025.1508455</pub-id><pub-id pub-id-type="medline">40444092</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>ZC</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>SP</given-names> </name><etal/></person-group><article-title>Comparative analysis of artificial intelligence tools for the dissemination of colorectal cancer screening guidelines: a novel perspective on early screening education</article-title><source>Int J Surg</source><year>2025</year><month>11</month><day>1</day><volume>111</volume><issue>11</issue><fpage>8616</fpage><lpage>8620</lpage><pub-id pub-id-type="doi">10.1097/JS9.0000000000002951</pub-id><pub-id pub-id-type="medline">40607944</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zeng</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Cao</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>F</given-names> </name></person-group><article-title>Guideline adherence in surgical decisions for T1 colorectal cancer after endoscopic resection: large language models vs clinicians</article-title><source>Int J Surg</source><year>2026</year><month>01</month><day>1</day><volume>112</volume><issue>1</issue><fpage>1886</fpage><lpage>1890</lpage><pub-id pub-id-type="doi">10.1097/JS9.0000000000003492</pub-id><pub-id pub-id-type="medline">40928382</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Evaluating large language models for information extraction from gastroscopy and colonoscopy reports through multi-strategy prompting</article-title><source>J Biomed Inform</source><year>2025</year><month>08</month><volume>168</volume><fpage>104844</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2025.104844</pub-id><pub-id pub-id-type="medline">40505790</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>EW</given-names> </name><name name-style="western"><surname>Waldrup</surname><given-names>B</given-names> </name><name 
name-style="western"><surname>Velazquez-Villarreal</surname><given-names>E</given-names> </name></person-group><article-title>Conversational AI agent for precision oncology: AI-HOPE-WNT integrates clinical and genomic data to investigate WNT pathway dysregulation in colorectal cancer</article-title><source>Front Artif Intell</source><year>2025</year><volume>8</volume><fpage>1624797</fpage><pub-id pub-id-type="doi">10.3389/frai.2025.1624797</pub-id><pub-id pub-id-type="medline">40860720</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Leveraging large language and vision models for knowledge extraction from large-scale image-text colonoscopy records</article-title><source>Nat Biomed Eng</source><year>2025</year><month>09</month><day>16</day><pub-id pub-id-type="doi">10.1038/s41551-025-01500-x</pub-id><pub-id pub-id-type="medline">40958005</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sehgal</surname><given-names>NKR</given-names> </name><name name-style="western"><surname>Tonneau</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Effect of static vs. 
conversational AI-generated messages on colorectal cancer screening intent: a randomized controlled trial</article-title><source>arXiv</source><comment>Preprint posted online on Jul 10, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2507.08211</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schmutz</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sommer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sander</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Large language model processing capabilities of ChatGPT 4.0 to generate molecular tumor board recommendations&#x2014;a critical evaluation on real world data</article-title><source>Oncologist</source><year>2025</year><month>10</month><day>1</day><volume>30</volume><issue>10</issue><fpage>oyaf293</fpage><pub-id pub-id-type="doi">10.1093/oncolo/oyaf293</pub-id><pub-id pub-id-type="medline">40973166</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Massimi</surname><given-names>D</given-names> </name><name name-style="western"><surname>Carlini</surname><given-names>L</given-names> </name><name name-style="western"><surname>Mori</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Large language model for interpreting the Paris classification of colorectal polyps</article-title><source>Endosc Int Open</source><year>2025</year><volume>13</volume><issue>CP</issue><fpage>a27030209</fpage><pub-id pub-id-type="doi">10.1055/a-2703-0209</pub-id><pub-id pub-id-type="medline">41079216</pub-id></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Maida</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mori</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Fuccio</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Exploring ChatGPT effectiveness in addressing direct patient queries on colorectal cancer screening</article-title><source>Endosc Int Open</source><year>2025</year><volume>13</volume><issue>CP</issue><fpage>a25689416</fpage><pub-id pub-id-type="doi">10.1055/a-2568-9416</pub-id><pub-id pub-id-type="medline">40376022</pub-id></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cai</surname><given-names>P</given-names> </name><collab>Artificial Intelligence Colorectal Cancer Research (AI-CORE) Working Group</collab></person-group><article-title>Multidimensional assessment of ChatGPT in colorectal cancer postoperative consultations: analysing response variations across critical clinical domains</article-title><source>Digit Health</source><year>2025</year><volume>11</volume><fpage>20552076251393297</fpage><pub-id pub-id-type="doi">10.1177/20552076251393297</pub-id><pub-id pub-id-type="medline">41181549</pub-id></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ding</surname><given-names>L</given-names> </name><name name-style="western"><surname>Fan</surname><given-names>L</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Evaluating ChatGPT&#x2019;s diagnostic potential for pathology 
images</article-title><source>Front Med (Lausanne)</source><year>2025</year><volume>11</volume><fpage>1507203</fpage><pub-id pub-id-type="doi">10.3389/fmed.2024.1507203</pub-id><pub-id pub-id-type="medline">41405831</pub-id></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Diaz</surname><given-names>FC</given-names> </name><name name-style="western"><surname>Waldrup</surname><given-names>B</given-names> </name><name name-style="western"><surname>Carranza</surname><given-names>FG</given-names> </name><name name-style="western"><surname>Manjarrez</surname><given-names>S</given-names> </name><name name-style="western"><surname>Velazquez-Villarreal</surname><given-names>E</given-names> </name></person-group><article-title>Artificial intelligence-enhanced precision medicine reveals prognostic impact of TGF-beta pathway alterations in FOLFOX-treated early-onset colorectal cancer among disproportionately affected populations</article-title><source>Int J Mol Sci</source><year>2025</year><month>09</month><day>17</day><volume>26</volume><issue>18</issue><fpage>9067</fpage><pub-id pub-id-type="doi">10.3390/ijms26189067</pub-id><pub-id pub-id-type="medline">41009631</pub-id></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chatziisaak</surname><given-names>D</given-names> </name><name name-style="western"><surname>Burri</surname><given-names>P</given-names> </name><name name-style="western"><surname>Sparn</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hahnloser</surname><given-names>D</given-names> </name><name name-style="western"><surname>Steffen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bischofberger</surname><given-names>S</given-names> 
</name></person-group><article-title>Concordance of ChatGPT artificial intelligence decision-making in colorectal cancer multidisciplinary meetings: retrospective study</article-title><source>BJS Open</source><year>2025</year><month>05</month><day>7</day><volume>9</volume><issue>3</issue><fpage>zraf040</fpage><pub-id pub-id-type="doi">10.1093/bjsopen/zraf040</pub-id><pub-id pub-id-type="medline">40331891</pub-id></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Garg</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Mau</surname><given-names>B</given-names> </name><name name-style="western"><surname>Hubers</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Colon-Pilot: a generative AI tool for automated colonoscopy surveillance recommendations and 2024 ACG/ASGE quality benchmarking</article-title><source>Am J Gastroenterol</source><year>2026</year><month>04</month><day>1</day><volume>121</volume><issue>4</issue><fpage>964</fpage><lpage>973</lpage><pub-id pub-id-type="doi">10.14309/ajg.0000000000003946</pub-id><pub-id pub-id-type="medline">41665228</pub-id></nlm-citation></ref><ref id="ref69"><label>69</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Baek</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Ryu</surname><given-names>HS</given-names> </name><etal/></person-group><article-title>Using large language models for clinical staging of colorectal cancer from imaging reports: a pilot study</article-title><source>Ann Surg Treat Res</source><year>2025</year><month>11</month><volume>109</volume><issue>5</issue><fpage>318</fpage><lpage>327</lpage><pub-id 
pub-id-type="doi">10.4174/astr.2025.109.5.318</pub-id><pub-id pub-id-type="medline">41255477</pub-id></nlm-citation></ref><ref id="ref70"><label>70</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bi</surname><given-names>W</given-names> </name><name name-style="western"><surname>Lv</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name></person-group><article-title>An entity extraction pipeline for medical text records using large language models: analytical study</article-title><source>J Med Internet Res</source><year>2024</year><month>03</month><day>29</day><volume>26</volume><fpage>e54580</fpage><pub-id pub-id-type="doi">10.2196/54580</pub-id><pub-id pub-id-type="medline">38551633</pub-id></nlm-citation></ref><ref id="ref71"><label>71</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>MY</given-names> </name><etal/></person-group><article-title>Towards a general-purpose foundation model for computational pathology</article-title><source>Nat Med</source><year>2024</year><month>03</month><volume>30</volume><issue>3</issue><fpage>850</fpage><lpage>862</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-02857-3</pub-id><pub-id pub-id-type="medline">38504018</pub-id></nlm-citation></ref><ref id="ref72"><label>72</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Lin</surname><given-names>H</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Large language model trained on clinical oncology data predicts cancer progression</article-title><source>NPJ Digit Med</source><year>2025</year><month>07</month><day>2</day><volume>8</volume><issue>1</issue><fpage>397</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01780-2</pub-id><pub-id pub-id-type="medline">40604229</pub-id></nlm-citation></ref><ref id="ref73"><label>73</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tariq</surname><given-names>R</given-names> </name><name name-style="western"><surname>Malik</surname><given-names>S</given-names> </name><name name-style="western"><surname>Khanna</surname><given-names>S</given-names> </name></person-group><article-title>Evolving landscape of large language models: an evaluation of ChatGPT and Bard in answering patient queries on colonoscopy</article-title><source>Gastroenterology</source><year>2024</year><month>01</month><volume>166</volume><issue>1</issue><fpage>220</fpage><lpage>221</lpage><pub-id pub-id-type="doi">10.1053/j.gastro.2023.08.033</pub-id><pub-id pub-id-type="medline">37634736</pub-id></nlm-citation></ref><ref id="ref74"><label>74</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maida</surname><given-names>M</given-names> </name><name name-style="western"><surname>Celsa</surname><given-names>C</given-names> </name><name name-style="western"><surname>Lau</surname><given-names>LHS</given-names> </name><etal/></person-group><article-title>The application of large language models in gastroenterology: a review of the literature</article-title><source>Cancers 
(Basel)</source><year>2024</year><month>09</month><day>28</day><volume>16</volume><issue>19</issue><fpage>3328</fpage><pub-id pub-id-type="doi">10.3390/cancers16193328</pub-id><pub-id pub-id-type="medline">39409948</pub-id></nlm-citation></ref><ref id="ref75"><label>75</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jonnagaddala</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shulajkovska</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gradi&#x0161;ek</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Multimodal analysis of whole slide images in colorectal cancer</article-title><source>NPJ Digit Med</source><year>2025</year><month>11</month><day>24</day><volume>8</volume><issue>1</issue><fpage>719</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-02095-y</pub-id><pub-id pub-id-type="medline">41286436</pub-id></nlm-citation></ref><ref id="ref76"><label>76</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref77"><label>77</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Giuffr&#x00E8;</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Kresevic</surname><given-names>S</given-names> </name><name name-style="western"><surname>Pugliese</surname><given-names>N</given-names> </name><name name-style="western"><surname>You</surname><given-names>K</given-names> </name><name name-style="western"><surname>Shung</surname><given-names>DL</given-names> </name></person-group><article-title>Optimizing large language models in digestive disease: strategies and challenges to improve clinical outcomes</article-title><source>Liver Int</source><year>2024</year><month>09</month><volume>44</volume><issue>9</issue><fpage>2114</fpage><lpage>2124</lpage><pub-id pub-id-type="doi">10.1111/liv.15974</pub-id><pub-id pub-id-type="medline">38819632</pub-id></nlm-citation></ref><ref id="ref78"><label>78</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fraile Navarro</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ijaz</surname><given-names>K</given-names> </name><name name-style="western"><surname>Rezazadegan</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Clinical named entity recognition and relation extraction using natural language processing of medical free text: a systematic review</article-title><source>Int J Med Inform</source><year>2023</year><month>09</month><volume>177</volume><fpage>105122</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2023.105122</pub-id><pub-id pub-id-type="medline">37295138</pub-id></nlm-citation></ref><ref id="ref79"><label>79</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shariatmadari</surname><given-names>AH</given-names> </name><name name-style="western"><surname>Xiong</surname><given-names>G</given-names> </name><name 
name-style="western"><surname>Zhang</surname><given-names>A</given-names> </name></person-group><article-title>Embracing foundation models for advancing scientific discovery</article-title><conf-name>2024 IEEE International Conference on Big Data (BigData)</conf-name><conf-date>Dec 15-18, 2024</conf-date><conf-loc>Washington, DC, USA</conf-loc><fpage>1746</fpage><lpage>1755</lpage><pub-id pub-id-type="doi">10.1109/BigData62323.2024.10825618</pub-id></nlm-citation></ref><ref id="ref80"><label>80</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abdel-Rehim</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zenil</surname><given-names>H</given-names> </name><name name-style="western"><surname>Orhobor</surname><given-names>O</given-names> </name><etal/></person-group><article-title>Scientific hypothesis generation by large language models: laboratory validation in breast cancer treatment</article-title><source>J R Soc Interface</source><year>2025</year><month>06</month><volume>22</volume><issue>227</issue><fpage>20240674</fpage><pub-id pub-id-type="doi">10.1098/rsif.2024.0674</pub-id><pub-id pub-id-type="medline">40462712</pub-id></nlm-citation></ref><ref id="ref81"><label>81</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hadjiiski</surname><given-names>L</given-names> </name><name name-style="western"><surname>Gormley</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Outcome prediction using multi-modal information: integrating large language model-extracted clinical information and image analysis</article-title><source>Cancers (Basel)</source><year>2024</year><month>06</month><day>29</day><volume>16</volume><issue>13</issue><fpage>2402</fpage><pub-id 
pub-id-type="doi">10.3390/cancers16132402</pub-id><pub-id pub-id-type="medline">39001463</pub-id></nlm-citation></ref><ref id="ref82"><label>82</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kocak</surname><given-names>Z</given-names> </name></person-group><article-title>Publication ethics in the era of artificial intelligence</article-title><source>J Korean Med Sci</source><year>2024</year><month>08</month><day>26</day><volume>39</volume><issue>33</issue><fpage>e249</fpage><pub-id pub-id-type="doi">10.3346/jkms.2024.39.e249</pub-id><pub-id pub-id-type="medline">39189714</pub-id></nlm-citation></ref><ref id="ref83"><label>83</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Parsa</surname><given-names>R</given-names> </name><name name-style="western"><surname>Swanson</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Large language models in oncology: a review</article-title><source>BMJ Oncol</source><year>2025</year><volume>4</volume><issue>1</issue><fpage>e000759</fpage><pub-id pub-id-type="doi">10.1136/bmjonc-2025-000759</pub-id><pub-id pub-id-type="medline">40519217</pub-id></nlm-citation></ref><ref id="ref84"><label>84</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hager</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jungmann</surname><given-names>F</given-names> </name><name name-style="western"><surname>Holland</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Evaluation and mitigation of the limitations of large language models in clinical decision-making</article-title><source>Nat 
Med</source><year>2024</year><month>09</month><volume>30</volume><issue>9</issue><fpage>2613</fpage><lpage>2622</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03097-1</pub-id><pub-id pub-id-type="medline">38965432</pub-id></nlm-citation></ref><ref id="ref85"><label>85</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Rong</surname><given-names>R</given-names> </name><etal/></person-group><article-title>A critical assessment of using ChatGPT for extracting structured data from clinical notes</article-title><source>NPJ Digit Med</source><year>2024</year><month>05</month><day>1</day><volume>7</volume><issue>1</issue><fpage>106</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01079-8</pub-id><pub-id pub-id-type="medline">38693429</pub-id></nlm-citation></ref><ref id="ref86"><label>86</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Cruz Rivera</surname><given-names>S</given-names> </name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Calvert</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Denniston</surname><given-names>AK</given-names> </name><collab>SPIRIT-AI and CONSORT-AI Working Group</collab></person-group><article-title>Reporting guidelines for clinical trial reports for interventions involving artificial intelligence: the CONSORT-AI extension</article-title><source>Nat Med</source><year>2020</year><month>09</month><volume>26</volume><issue>9</issue><fpage>1364</fpage><lpage>1374</lpage><pub-id 
pub-id-type="doi">10.1038/s41591-020-1034-x</pub-id><pub-id pub-id-type="medline">32908283</pub-id></nlm-citation></ref><ref id="ref87"><label>87</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>DECIDE-AI Steering Group</collab></person-group><article-title>DECIDE-AI: new reporting guidelines to bridge the development-to-implementation gap in clinical artificial intelligence</article-title><source>Nat Med</source><year>2021</year><month>02</month><volume>27</volume><issue>2</issue><fpage>186</fpage><lpage>187</lpage><pub-id pub-id-type="doi">10.1038/s41591-021-01229-5</pub-id><pub-id pub-id-type="medline">33526932</pub-id></nlm-citation></ref><ref id="ref88"><label>88</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zack</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lehman</surname><given-names>E</given-names> </name><name name-style="western"><surname>Suzgun</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Assessing the potential of GPT-4 to perpetuate racial and gender biases in health care: a model evaluation study</article-title><source>Lancet Digit Health</source><year>2024</year><month>01</month><volume>6</volume><issue>1</issue><fpage>e12</fpage><lpage>e22</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(23)00225-X</pub-id><pub-id pub-id-type="medline">38123252</pub-id></nlm-citation></ref><ref id="ref89"><label>89</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tschandl</surname><given-names>P</given-names> </name><name name-style="western"><surname>Rinner</surname><given-names>C</given-names> </name><name name-style="western"><surname>Apalla</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Human-computer collaboration 
for skin cancer recognition</article-title><source>Nat Med</source><year>2020</year><month>08</month><volume>26</volume><issue>8</issue><fpage>1229</fpage><lpage>1234</lpage><pub-id pub-id-type="doi">10.1038/s41591-020-0942-0</pub-id><pub-id pub-id-type="medline">32572267</pub-id></nlm-citation></ref><ref id="ref90"><label>90</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wornow</surname><given-names>M</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Thapa</surname><given-names>R</given-names> </name><etal/></person-group><article-title>The shaky foundations of large language models and foundation models for electronic health records</article-title><source>NPJ Digit Med</source><year>2023</year><month>07</month><day>29</day><volume>6</volume><issue>1</issue><fpage>135</fpage><pub-id pub-id-type="doi">10.1038/s41746-023-00879-8</pub-id><pub-id pub-id-type="medline">37516790</pub-id></nlm-citation></ref><ref id="ref91"><label>91</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moor</surname><given-names>M</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>O</given-names> </name><name name-style="western"><surname>Abad</surname><given-names>ZSH</given-names> </name><etal/></person-group><article-title>Foundation models for generalist medical artificial intelligence</article-title><source>Nature</source><year>2023</year><month>04</month><volume>616</volume><issue>7956</issue><fpage>259</fpage><lpage>265</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-05881-4</pub-id><pub-id pub-id-type="medline">37045921</pub-id></nlm-citation></ref><ref id="ref92"><label>92</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Zakka</surname><given-names>C</given-names> </name><name name-style="western"><surname>Shad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chaurasia</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Almanac - retrieval-augmented language models for clinical medicine</article-title><source>NEJM AI</source><year>2024</year><month>02</month><volume>1</volume><issue>2</issue><pub-id pub-id-type="doi">10.1056/aioa2300068</pub-id><pub-id pub-id-type="medline">38343631</pub-id></nlm-citation></ref><ref id="ref93"><label>93</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Artsi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Korfiatis</surname><given-names>P</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>GN</given-names> </name><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name></person-group><article-title>Large language models in real-world clinical workflows: a systematic review of applications and implementation</article-title><source>Front Digit Health</source><year>2025</year><volume>7</volume><fpage>1659134</fpage><pub-id pub-id-type="doi">10.3389/fdgth.2025.1659134</pub-id><pub-id pub-id-type="medline">41098649</pub-id></nlm-citation></ref><ref id="ref94"><label>94</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Nori</surname><given-names>H</given-names> </name><name name-style="western"><surname>King</surname><given-names>N</given-names> </name><name name-style="western"><surname>McKinney</surname><given-names>SM</given-names> </name><name 
name-style="western"><surname>Carignan</surname><given-names>D</given-names> </name><name name-style="western"><surname>Horvitz</surname><given-names>E</given-names> </name></person-group><article-title>Capabilities of GPT-4 on medical challenge problems</article-title><source>arXiv</source><comment>Preprint posted online on Mar 20, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.13375</pub-id></nlm-citation></ref><ref id="ref95"><label>95</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Schuurmans</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Chain-of-thought prompting elicits reasoning in large language models</article-title><source>arXiv</source><comment>Preprint posted online on Jan 10, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2201.11903</pub-id></nlm-citation></ref><ref id="ref96"><label>96</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ghassemi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Naumann</surname><given-names>T</given-names> </name><name name-style="western"><surname>Schulam</surname><given-names>P</given-names> </name><name name-style="western"><surname>Beam</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>IY</given-names> </name><name name-style="western"><surname>Ranganath</surname><given-names>R</given-names> </name></person-group><article-title>A review of challenges and opportunities in machine learning for health</article-title><source>AMIA Jt Summits Transl Sci Proc</source><year>2020</year><volume>2020</volume><fpage>191</fpage><lpage>200</lpage><pub-id 
pub-id-type="medline">32477638</pub-id></nlm-citation></ref><ref id="ref97"><label>97</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cervantes</surname><given-names>A</given-names> </name><name name-style="western"><surname>Adam</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rosell&#x00F3;</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Metastatic colorectal cancer: ESMO clinical practice guideline for diagnosis, treatment and follow-up</article-title><source>Ann Oncol</source><year>2023</year><month>01</month><volume>34</volume><issue>1</issue><fpage>10</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1016/j.annonc.2022.10.003</pub-id><pub-id pub-id-type="medline">36307056</pub-id></nlm-citation></ref><ref id="ref98"><label>98</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Char</surname><given-names>DS</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>NH</given-names> </name><name name-style="western"><surname>Magnus</surname><given-names>D</given-names> </name></person-group><article-title>Implementing machine learning in health care - addressing ethical challenges</article-title><source>N Engl J Med</source><year>2018</year><month>03</month><day>15</day><volume>378</volume><issue>11</issue><fpage>981</fpage><lpage>983</lpage><pub-id pub-id-type="doi">10.1056/NEJMp1714229</pub-id><pub-id pub-id-type="medline">29539284</pub-id></nlm-citation></ref><ref id="ref99"><label>99</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Amann</surname><given-names>J</given-names> </name><name name-style="western"><surname>Blasimme</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vayena</surname><given-names>E</given-names> 
</name><name name-style="western"><surname>Frey</surname><given-names>D</given-names> </name><name name-style="western"><surname>Madai</surname><given-names>VI</given-names> </name><collab>Precise4Q consortium</collab></person-group><article-title>Explainability for artificial intelligence in healthcare: a multidisciplinary perspective</article-title><source>BMC Med Inform Decis Mak</source><year>2020</year><month>11</month><day>30</day><volume>20</volume><issue>1</issue><fpage>310</fpage><pub-id pub-id-type="doi">10.1186/s12911-020-01332-6</pub-id><pub-id pub-id-type="medline">33256715</pub-id></nlm-citation></ref><ref id="ref100"><label>100</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ahmed</surname><given-names>M</given-names> </name><name name-style="western"><surname>Whicher</surname><given-names>D</given-names></name><name name-style="western"><surname>Israni</surname><given-names>ST</given-names> </name></person-group><source>Artificial Intelligence in Health Care: The Hope, the Hype, the Promise, the Peril</source><year>2023</year><access-date>2026-03-18</access-date><publisher-name>National Academy of Medicine</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="http://www.ncbi.nlm.nih.gov/books/NBK605955">http://www.ncbi.nlm.nih.gov/books/NBK605955</ext-link></comment></nlm-citation></ref><ref id="ref101"><label>101</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Davenport</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kalakota</surname><given-names>R</given-names> </name></person-group><article-title>The potential for artificial intelligence in healthcare</article-title><source>Future Healthc J</source><year>2019</year><month>06</month><volume>6</volume><issue>2</issue><fpage>94</fpage><lpage>98</lpage><pub-id 
pub-id-type="doi">10.7861/futurehosp.6-2-94</pub-id><pub-id pub-id-type="medline">31363513</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Detailed literature search strategies.</p><media xlink:href="jmir_v28i1e89862_app1.pdf" xlink:title="PDF File, 104 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Methodological classification, appraisal tools, and evaluation metrics of the included studies.</p><media xlink:href="jmir_v28i1e89862_app2.pdf" xlink:title="PDF File, 108 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Documentation of framework-preserving adaptations to quality appraisal tools.</p><media xlink:href="jmir_v28i1e89862_app3.pdf" xlink:title="PDF File, 145 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Prompt engineering strategies and application scenarios in the included studies.</p><media xlink:href="jmir_v28i1e89862_app4.pdf" xlink:title="PDF File, 87 KB"/></supplementary-material><supplementary-material id="app5"><label>Checklist 1</label><p>PRISMA&#x2010;S checklist.</p><media xlink:href="jmir_v28i1e89862_app5.pdf" xlink:title="PDF File, 129 KB"/></supplementary-material></app-group></back></article>