<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e95877</article-id><article-id pub-id-type="doi">10.2196/95877</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Assessing Eligibility for Anticancer Drug Health Insurance Reimbursement Using Large Language Models: Benchmark Development and Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Seo</surname><given-names>Junhyuk</given-names></name><degrees>RN, MS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Kim</surname><given-names>Taerim</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes"><name 
name-style="western"><surname>Kim</surname><given-names>Ju-Hyun</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib></contrib-group><aff id="aff1"><institution>Healthcare Research Institute, ETOILE Inc</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff2"><institution>Department of Digital Health, Samsung Advanced Institute of Health Sciences and Technology (SAIHST), Sungkyunkwan University</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff3"><institution>Department of Emergency Medicine, Sungkyunkwan University School of Medicine, Samsung Medical Center</institution><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><aff id="aff4"><institution>Department of Obstetrics and Gynecology, University of Ulsan College of Medicine, Asan Medical Center</institution><addr-line>88, Olympic-ro 43-gil, Songpa-gu</addr-line><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Kodan</surname><given-names>Amol</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Bursa</surname><given-names>Okan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Cao</surname><given-names>Yuchen</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Li</surname><given-names>Zhi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Ju-Hyun Kim, MD, PhD, Department of Obstetrics and Gynecology, University of Ulsan College of Medicine, Asan Medical Center, 88, Olympic-ro 43-gil, Songpa-gu, Seoul, 05505, Republic of Korea, 
82 10-9668-6227; <email>smilekako@naver.com</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>15</day><month>6</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e95877</elocation-id><history><date date-type="received"><day>22</day><month>03</month><year>2026</year></date><date date-type="rev-recd"><day>22</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>25</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Junhyuk Seo, Taerim Kim, Ju-Hyun Kim. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 15.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e95877"/><abstract><sec><title>Background</title><p>Administrative costs in the health care system are driven in part by complex insurance eligibility determinations. 
Large language models (LLMs) are increasingly used for health insurance&#x2013;related queries, yet their reliability for structured logical reasoning over coverage criteria has not been systematically evaluated.</p></sec><sec><title>Objective</title><p>This study aimed to develop a benchmark for anticancer drug reimbursement eligibility determination and evaluate whether LLMs can reliably perform eligibility verification.</p></sec><sec sec-type="methods"><title>Methods</title><p>We constructed a benchmark based on South Korea&#x2019;s National Health Insurance reimbursement guidelines for 3 gynecologic cancers (cervical, uterine, and ovarian), using a tristate adjudication framework (eligible, ineligible, and undeterminable). Three gynecologic oncology experts and a utilization review nurse validated the benchmark. Six LLMs from 3 providers (Anthropic, Google, and OpenAI) were evaluated using the official guideline document as input. Each case was evaluated 3 times per model, with final predictions determined by majority vote, and performance was compared across the 3 outcome classes.</p></sec><sec sec-type="results"><title>Results</title><p>The benchmark comprises 74 anticancer regimens with 222 cases. Overall verification accuracy ranged from 77.9% to 88.7% across the 6 models. Eligible and ineligible cases were classified with high recall (86.5%&#x2010;98.6%), but undeterminable cases showed a marked decline across all models (44.6%&#x2010;70.3%). Performance varied by cancer type, with uterine cancer showing the lowest undeterminable recall (16.7%), corresponding to the highest guideline complexity. Undeterminable cases were predominantly misclassified as eligible rather than ineligible. The tristate framework enabled logic-based error analysis of 235 incorrect predictions, revealing information gap-filling as the dominant failure pattern (n=196, 83.4%), followed by criterion misapplication (n=20, 8.5%) and false uncertainty (n=19, 8.1%). 
Subtype analysis indicated that information gap-filling errors were concentrated at hierarchical elements of the guideline. Sensitivity analyses showed that converting the guideline document to structured text degraded performance, while the web search&#x2013;enabled condition (0%&#x2010;3.2% tool invocation across models) and structure-guided prompting did not produce significant changes from baseline.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>In this benchmark, LLMs classified clearly eligible and ineligible cases with relatively high recall but showed limited reliability on undeterminable cases. The dominant error pattern was information gap-filling, in which models inferred eligibility rather than withholding judgment. These findings indicate that LLMs, in their current form, should be deployed as supervised decision-support tools rather than as independent adjudicators in reimbursement review.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>health insurance</kwd><kwd>reimbursement</kwd><kwd>eligibility verification</kwd><kwd>anticancer drugs</kwd><kwd>clinical decision support</kwd><kwd>benchmark</kwd><kwd>natural language processing</kwd><kwd>gynecologic cancer</kwd><kwd>National Health Insurance</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>In the United States, billing- and insurance-related activities were estimated to cost approximately US $471 billion annually, representing roughly 18% of national health expenditure [<xref ref-type="bibr" rid="ref1">1</xref>]. At the provider level, billing- and insurance-related costs account for 3% to 25% of professional revenue depending on the type of clinical encounter [<xref ref-type="bibr" rid="ref2">2</xref>]. This administrative burden has significant negative impacts on both patients and clinicians [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. 
Although health systems and payer models vary worldwide, billing processes follow a common pathway&#x2014;from eligibility verification and coding to claim submission and rework&#x2014;across countries with diverse payer structures [<xref ref-type="bibr" rid="ref5">5</xref>]. These burdens persist even in single-payer systems, where centralized rule-based reimbursement introduces a different form of administrative complexity.</p><p>In South Korea, the entire population is enrolled in the National Health Insurance (NHI) by law and relies on predefined coverage rules and eligibility criteria [<xref ref-type="bibr" rid="ref6">6</xref>]. Under this system, the National Health Insurance Service functions as the single insurer, while the Health Insurance Review &#x0026; Assessment Service (HIRA) conducts centralized claims review and quality assessment [<xref ref-type="bibr" rid="ref7">7</xref>]. Reimbursement decisions are made through a retrospective claims review process, in which submitted claims are evaluated for compliance with detailed coverage rules [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Under this retrospective review structure, eligibility determination becomes a high-stakes post hoc decision, where errors can directly lead to claim rejection, payment adjustment, and downstream administrative burden.</p><p>As anticancer agents continue to be developed and covered by the NHI, national expenditure on these drugs surged by 168.2% from 2013 to 2022, reaching US $982 million in 2022 [<xref ref-type="bibr" rid="ref10">10</xref>]. Alongside this rapid cost increase, eligibility criteria for anticancer drugs have become increasingly complicated, requiring simultaneous assessment of multiple clinical variables&#x2014;such as tumor type, prior regimen sequences, biomarker thresholds, and performance scores&#x2014;connected by nested AND/OR conditions [<xref ref-type="bibr" rid="ref11">11</xref>]. 
This combinatorial structure makes manual review prone to omission and misinterpretation, particularly when multiple interdependent conditions must be evaluated simultaneously.</p><p>A recent report found that users of ChatGPT in the United States alone send 1.6 to 1.9 million health insurance&#x2013;related messages each week to understand coverage and navigate claims and denials [<xref ref-type="bibr" rid="ref12">12</xref>]. Despite this high level of use, large language models (LLMs) often produce hallucinations and other errors that can lead to critical mistakes, especially in medical domains [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. When LLMs have been applied to improve the understandability of clinical guidelines, unintended omissions and changes in meaning were identified in up to 20% of revised subsections [<xref ref-type="bibr" rid="ref15">15</xref>]. If even surface-level revisions can introduce errors, the risk may be amplified when LLMs must reason over the logical structure of guidelines to reach coverage determination.</p><p>Recent studies have shown that LLMs can support reimbursement-related tasks, such as detecting clinical conditions for appropriate coding [<xref ref-type="bibr" rid="ref16">16</xref>], and automating <italic>International Classification of Diseases</italic> code assignment from clinical documentation [<xref ref-type="bibr" rid="ref17">17</xref>]. However, these applications focus on information extraction from clinical notes. Determining reimbursement eligibility, by contrast, requires structured logical reasoning in which multiple clinical attributes must be jointly assessed against complex rule sets, where a single incorrect inference can invalidate the entire decision. 
Moreover, in clinical practice, available documentation is often incomplete [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>], and a reliable system must also recognize when the evidence is insufficient to reach a determination rather than forcing a binary decision. Yet, no standardized benchmark exists to evaluate these capabilities under real-world insurance review constraints.</p><p>To address this gap, we developed a benchmark for anticancer drug reimbursement eligibility based on South Korea&#x2019;s national guidelines. Our benchmark formalizes the condition-level adjudication logic that clinicians and utilization review nurses routinely apply when evaluating incomplete clinical evidence against coverage rules. It covers 3 gynecologic cancers&#x2014;cervical, uterine, and ovarian&#x2014;and includes eligible, ineligible, and undeterminable cases to assess not only correctness but also whether models can recognize when available evidence is insufficient to support a determination. Using this benchmark, we evaluated LLM reliability for reimbursement eligibility adjudication under incomplete information.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Benchmark Development</title><p>The study design is summarized in <xref ref-type="fig" rid="figure1">Figure 1</xref>. This study was reported in accordance with the TRIPOD-LLM (Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis-Large Language Model) reporting guideline [<xref ref-type="bibr" rid="ref20">20</xref>]. A completed checklist is provided in <xref ref-type="supplementary-material" rid="app6">Checklist 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Study design overview. The benchmark comprises 222 cases derived from national reimbursement guidelines for 3 gynecologic cancers (74 regimens &#x00D7; 3 outcome classes). 
Validation was performed by 3 gynecologic oncology experts and an independent utilization review nurse. Six models were evaluated using the finalized benchmark. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e95877_fig01.png"/></fig><p>We developed the benchmark using the HIRA reimbursement review guidelines for anticancer drugs, version dated February 1, 2026. Three major gynecologic cancers&#x2014;cervical cancer, uterine cancer, and ovarian cancer&#x2014;were included. For each cancer type, reimbursement criteria were translated into structured clinical attributes representing heterogeneous patient scenarios. Candidate benchmark items were created by a researcher with expertise in medical informatics (JS).</p><p>The benchmark was constructed within a tristate adjudication framework, including eligible, ineligible, and undeterminable outcomes for each anticancer regimen. This outcome assignment is based on 3 different condition-level states: met, not met, and unevaluable. We use &#x201C;not met&#x201D; to denote a condition whose required value is present but violates the criterion, and &#x201C;unevaluable&#x201D; to denote a condition that cannot be assessed because the required information is absent (<xref ref-type="fig" rid="figure2">Figure 2</xref>). The unevaluable state was included to reflect real-world documentation contexts in which information required for applying a coverage criterion may be absent, incomplete, or inaccessible.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Tristate adjudication framework for benchmark outcome assignment. Each required condition for a regimen was classified as met, not met, or unevaluable. Eligible cases met all required conditions. Ineligible cases included at least 1 explicitly not-met condition. 
Undeterminable cases included no explicitly not-met condition but at least 1 unevaluable condition due to missing information.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e95877_fig02.png"/></fig><p>For instance, a case is classified as ineligible when 1 or more conditions are explicitly not met, regardless of whether other conditions are unevaluable. A case is classified as undeterminable when no condition is explicitly not met, yet 1 or more conditions remain unevaluable. To control task difficulty, ineligible and undeterminable cases were designed as near-miss scenarios containing only 1 to 2 conditions not met or unevaluable. The detailed definitions of the outcomes are shown in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Definitions of eligibility outcomes used in the benchmark, with illustrative examples from cervical cancer topotecan (single use).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Outcome</td><td align="left" valign="bottom">Definition</td><td align="left" valign="bottom">Example scenario</td><td align="left" valign="bottom">Explanation</td></tr></thead><tbody><tr><td align="left" valign="top">Eligible</td><td align="left" valign="top">All required conditions are met</td><td align="left" valign="top">Clinical indication=recurrent; line of therapy=second</td><td align="left" valign="top">All guideline conditions are satisfied.</td></tr><tr><td align="left" valign="top">Ineligible</td><td align="left" valign="top">&#x2265;1 condition is explicitly not met, regardless of whether other conditions are unevaluable</td><td align="left" valign="top">Clinical indication=recurrent; line of therapy=first</td><td align="left" valign="top">Line of therapy is first line. 
The condition is present but violates the criterion (&#x201C;not met&#x201D;).</td></tr><tr><td align="left" valign="top">Undeterminable</td><td align="left" valign="top">No condition is explicitly not met, but &#x2265;1 condition is unevaluable due to absent clinical information</td><td align="left" valign="top">Clinical indication=recurrent</td><td align="left" valign="top">Line of therapy is required but absent. No condition is violated, so the case cannot be classified as ineligible, yet a determination cannot be reached (&#x201C;unevaluable&#x201D;).</td></tr></tbody></table></table-wrap></sec><sec id="s2-2"><title>Benchmark Validation</title><p>The benchmark was reviewed by 3 gynecologic oncology experts, including 2 nurse practitioners and 1 physician. Validation focused on the clinical plausibility of the patient scenarios, the appropriateness of attribute combinations, and the consistency of the reference answers.</p><p>For instance, the experts identified that palliative regimens inherently target patients with recurrent or metastatic disease, meaning that clinical indication is implicitly satisfied, and its removal alone does not produce a valid undeterminable case. For such regimens, additional clinical attributes had to be removed to construct undeterminable scenarios. The review also identified certain regimens that are permissively reimbursed in clinical practice despite not strictly meeting the guideline criteria. For these cases, reference answers were assigned based on the guideline criteria.</p><p>After clinical expert review, all benchmark cases were independently reviewed by a utilization review nurse, and discrepancies were resolved through consensus while preserving the balanced class structure. Interrater agreement between experts and the utilization review nurse was assessed using Cohen &#x03BA;. 
This multidisciplinary validation approach reflects real-world reimbursement review processes, which involve both clinical and administrative expertise.</p></sec><sec id="s2-3"><title>LLM Evaluation</title><p>We evaluated 6 LLMs from 3 providers: Gemini 3.1 Pro (gemini-3.1-pro-preview) and Gemini 3 Flash (gemini-3-flash-preview) from Google; Claude Opus 4.6 (claude-opus-4-6) and Claude Sonnet 4.6 (claude-sonnet-4-6) from Anthropic; and GPT-5.4 (gpt-5.4-2026-03-05) and GPT-5 Mini (gpt-5-mini-2025-08-07) from OpenAI. All models were evaluated under the primary condition, in which the original HIRA guideline PDF was provided via each model&#x2019;s native document upload feature. Guideline documents were provided as 3 cancer-specific PDF files corresponding to cervical, uterine, and ovarian cancer. Because the set of user-adjustable hyperparameters differed across models and the evaluation included both reasoning and nonreasoning models, all models were evaluated using provider-default settings, and no parameter was explicitly modified. To assess response stability, each model was run 3 times under the same condition.</p><p>All models received the same standardized prompt template (<xref ref-type="other" rid="box1">Textbox 1</xref>). The system prompt instructed the model to act as an expert reviewer of Korean NHI reimbursement for oncology regimens and to return a structured JSON response containing a 3-class decision (eligible, ineligible, or undeterminable) and a single-sentence rationale. 
The user prompt specified the cancer type, regimen name, and structured clinical and administrative attributes.</p><boxed-text id="box1"><title>Standardized prompt template for reimbursement eligibility determination.</title><p><bold>System Prompt:</bold></p><p>You are an expert reviewer of Korean National Health Insurance reimbursement for oncology regimens.</p><p>Given the clinical and administrative attributes and any provided material, decide whether the named regimen is reimbursable.</p><p>Respond with exactly one JSON object using this schema:</p><p>{&#x201C;decision&#x201D;:&#x201C;eligible|ineligible|undeterminable&#x201D;,&#x201C;reason&#x201D;:&#x201C;&#x003C;one short sentence&#x003E;&#x201D;}</p><p>The following are the outcome definitions:</p><list list-type="bullet"><list-item><p>Eligible: All required conditions are met.</p></list-item><list-item><p>Ineligible: 1 or more conditions are explicitly not met, regardless of whether other conditions are unevaluable. The condition is present but violates the criterion.</p></list-item><list-item><p>Undeterminable: No condition is explicitly not met, but 1 or more conditions are unevaluable due to absent clinical information. Because no criterion is violated, the case cannot be classified as ineligible, yet a determination cannot be reached.</p></list-item></list><p><bold>User Prompt:</bold></p><p>Cancer type:&#x003C;cancer_type&#x003E;</p><p>Regimen:&#x003C;regimen_name&#x003E;</p><p>Clinical and administrative attributes:&#x003C;structured attributes&#x003E;</p><p>Task: Determine whether this patient is eligible for reimbursement for the regimen above.</p></boxed-text></sec><sec id="s2-4"><title>Statistical Analysis</title><p>Each model was evaluated over 3 independent runs. To capture run-to-run consistency of the models, accuracy was recorded to compute mean and SD. A final predicted label for each case was determined by majority voting across the 3 runs. 
A 3-way tie was resolved as undeterminable, reflecting the model&#x2019;s own uncertainty about the case.</p><p>All evaluation metrics&#x2014;precision, recall, and <italic>F</italic><sub>1</sub>-score&#x2014;were calculated from these majority-voted predictions. Ninety-five percent CIs for accuracy, precision, and recall were calculated using the Wilson method, and those for the <italic>F</italic><sub>1</sub>-score were obtained by bootstrapping (n=1000). The Bhapkar test of marginal homogeneity was used to assess whether the predicted outcome distribution differed from the observed distribution across the 3 outcome classes. Analyses were performed using Python 3.10 with SciPy.</p></sec><sec id="s2-5"><title>Error Analysis</title><p>We further conducted a retrospective error analysis of all misclassified cases to characterize how models failed across the benchmark. Because the benchmark was explicitly designed around a tristate adjudication framework, model errors could be interpreted according to the adjudication principle they violated.</p><p>We therefore conducted a logic-based error analysis in which each expected-predicted outcome pair was mapped to an error category. Representative rationales and guideline locations were then reviewed to characterize recurrent structural sources of the dominant error category. This rationale review was performed blinded to model identity to mitigate potential reviewer bias.</p></sec><sec id="s2-6"><title>Sensitivity Analysis</title><p>Sensitivity analysis consisted of 3 strategies. First, to examine the effect of input format, the guideline file (PDF) was parsed into structured text (Markdown) using the pdfplumber library, and all models were evaluated with the extracted text as input. Second, to assess the effect of information access, search-enabled conditions were evaluated on all models. This setting was intended to reflect scenarios in which patients or health care providers use LLMs with web search enabled. 
Third, to investigate the effect of prompting strategy, a prompt informed by the error analysis findings was evaluated, instructing models to verify each eligibility condition step by step.</p><p>For the sensitivity analysis, each prediction was dichotomized as correct or incorrect against the reference standard. Pairwise comparisons of correctness between the baseline and each sensitivity condition were then performed using the McNemar test, with Benjamini-Hochberg correction for the false discovery rate. Adjusted <italic>P</italic> values are reported as <italic>q</italic> values.</p></sec><sec id="s2-7"><title>Ethical Considerations</title><p>This study did not involve human participants, real patient data, or protected health information. All patient scenarios were synthetically constructed based on publicly available reimbursement guidelines published by the HIRA in South Korea.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Benchmark Statistics</title><p>Agreement between the utilization review nurse and gynecologic oncology experts was initially 95% (211/222; Cohen &#x03BA;=0.93). Eleven cases were identified in which specific qualifying conditions had not been fully reflected in the benchmark scenarios. These discrepancies were resolved through consensus while preserving the balanced class structure.</p><p>The final benchmark composition is summarized in <xref ref-type="table" rid="table2">Table 2</xref>. A total of 74 anticancer regimens (15 cervical, 17 uterine, and 42 ovarian) were each assigned 3 outcome scenarios, yielding 222 cases. The mean number of eligibility conditions per regimen was 4.2 (SD 1.9; range 2&#x2010;11). Uterine (mean 4.5, SD 2.8; range 3-11) and ovarian (mean 4.5, SD 1.3; range 2-8) cancer regimens showed comparable mean complexity, but uterine cancer included the most complex individual regimens, with up to 11 eligibility conditions. 
Cervical cancer had the simplest criteria (mean 2.9, SD 1.4; range 2&#x2010;6).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Benchmark composition by cancer type (N=222).</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristic</td><td align="left" valign="bottom">Cervical</td><td align="left" valign="bottom">Uterine</td><td align="left" valign="bottom">Ovarian</td><td align="left" valign="bottom">Total</td></tr></thead><tbody><tr><td align="left" valign="top">Regimens, n</td><td align="left" valign="top">15</td><td align="left" valign="top">17</td><td align="left" valign="top">42</td><td align="left" valign="top">74</td></tr><tr><td align="left" valign="top">Benchmark cases, n</td><td align="left" valign="top">45</td><td align="left" valign="top">51</td><td align="left" valign="top">126</td><td align="left" valign="top">222</td></tr><tr><td align="left" valign="top">Unique clinical attribute types, n</td><td align="left" valign="top">10</td><td align="left" valign="top">14</td><td align="left" valign="top">24</td><td align="left" valign="top">38<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">Conditions per regimen, mean (SD; range)</td><td align="left" valign="top">2.9 (1.4; 2-6)</td><td align="left" valign="top">4.5 (2.8; 3-11)</td><td align="left" valign="top">4.5 (1.3; 2-8)</td><td align="left" valign="top">4.2 (1.9; 2-11)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>The total reflects the number of distinct attribute types after deduplication across cancer types.</p></fn></table-wrap-foot></table-wrap><p>The upper range of complexity in uterine cancer was largely driven by immunotherapy regimens such as pembrolizumab plus lenvatinib, which required up to 11 simultaneous conditions, including deficient mismatch repair or microsatellite instability-high status, Eastern Cooperative 
Oncology Group performance score, and prior treatment history.</p></sec><sec id="s3-2"><title>Overall LLM Performance</title><p>Overall accuracy ranged from 77.9% (GPT-5.4) to 88.7% (Gemini 3.1 Pro) across the 6 models. No malformed outputs were observed. Performance was relatively stable across runs, with SDs ranging from 0.3 to 2.0 percentage points (pp). Despite these differences in overall accuracy, all models showed the same general pattern of substantially lower recall for undeterminable cases than for eligible or ineligible cases. Three-way ties across runs were rare, occurring in only 1 case out of all model evaluations (222 cases &#x00D7; 6 models). Detailed results are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s3-3"><title>Analysis by Outcome Class</title><p>The LLM output was analyzed by 3 outcome classes (<xref ref-type="fig" rid="figure3">Figure 3</xref>). For eligible cases, recall was high across models (93.2%&#x2010;98.6%), and ineligible cases showed similarly strong performance (86.5%&#x2010;97.3%). By contrast, undeterminable cases showed a marked decline across all models, ranging from 44.6% to 70.3%. Among the models, Gemini 3.1 Pro showed the highest recall for undeterminable cases (70.3%), whereas GPT-5.4 showed the lowest (44.6%). The full results with precision, recall, <italic>F</italic><sub>1</sub>-score, and confusion matrix are provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Recall by model with 95% CI (n=74 per class, majority vote). Eligible (solid blue) cases met all required conditions. Ineligible (hatched blue) cases contained at least 1 explicitly not-met condition. 
Undeterminable (red) cases contained no explicitly not-met condition but at least 1 unevaluable condition due to missing information.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e95877_fig03.png"/></fig><p>Analysis of misclassification direction for undeterminable cases (recall 44.6%&#x2010;70.3%; <xref ref-type="table" rid="table3">Table 3</xref>) revealed that models predominantly misclassified these cases as eligible (28.4%&#x2010;50%) rather than ineligible (0%&#x2010;5.4%). This pattern was observed across all 6 models. Consistent with this pattern, Bhapkar tests showed significant marginal heterogeneity between observed and predicted outcome distributions for all models (all <italic>P</italic>&#x003C;.001; <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Misclassification direction for undeterminable cases by model (majority vote, n=74 per model).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Predicted eligible, n (%)</td><td align="left" valign="bottom">Predicted ineligible, n (%)</td><td align="left" valign="bottom">Predicted undeterminable, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Gemini 3.1 Pro</td><td align="left" valign="top">21 (28.4)</td><td align="left" valign="top">1 (1.4)</td><td align="left" valign="top">52 (70.3)</td></tr><tr><td align="left" valign="top">Gemini 3 Flash</td><td align="left" valign="top">33 (44.6)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">41 (55.4)</td></tr><tr><td align="left" valign="top">Claude Opus 4.6</td><td align="left" valign="top">31 (41.9)</td><td align="left" valign="top">1 (1.4)</td><td align="left" valign="top">42 (56.8)</td></tr><tr><td align="left" valign="top">Claude Sonnet 4.6</td><td 
align="left" valign="top">35 (47.3)</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">39 (52.7)</td></tr><tr><td align="left" valign="top">GPT-5.4</td><td align="left" valign="top">37 (50)</td><td align="left" valign="top">4 (5.4)</td><td align="left" valign="top">33 (44.6)</td></tr><tr><td align="left" valign="top">GPT-5 Mini</td><td align="left" valign="top">31 (41.9)</td><td align="left" valign="top">2 (2.7)</td><td align="left" valign="top">41 (55.4)</td></tr></tbody></table></table-wrap></sec><sec id="s3-4"><title>Analysis by Cancer Type</title><p>Performance varied across cancer types. Cervical cancer showed the highest mean recall across 6 models (92.6%), followed by ovarian (82.9%) and uterine (71.9%) cancer. Per-regimen complexity was lowest for cervical cancer (mean 2.9, SD 1.4; range 2&#x2010;6 conditions) and highest for uterine cancer (mean 4.5, SD 2.8; range 3&#x2010;11). Notably, recall varied substantially by outcome class within each cancer type (<xref ref-type="table" rid="table4">Table 4</xref>). While eligible and ineligible recall remained high across cancer types (89.7%&#x2010;100%), undeterminable recall dropped sharply, particularly for uterine cancer (mean 16.7%), where per-regimen complexity was highest. 
Model-wise results by cancer type and outcome class are provided in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Pooled recall across cancer types and outcome classes with 95% CI (6 models).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Cancer type</td><td align="left" valign="bottom">Eligible, % (95% CI)</td><td align="left" valign="bottom">Ineligible, % (95% CI)</td><td align="left" valign="bottom">Undeterminable, % (95% CI)</td><td align="left" valign="bottom">Overall, % (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top">Cervical</td><td align="left" valign="top">100 (95.9&#x2010;100.0)</td><td align="left" valign="top">98.9 (94.0&#x2010;99.8)</td><td align="left" valign="top">78.9 (69.4&#x2010;86.0)</td><td align="left" valign="top">92.6 (88.8&#x2010;95.2)</td></tr><tr><td align="left" valign="top">Uterine</td><td align="left" valign="top">99 (94.7&#x2010;99.8)</td><td align="left" valign="top">100 (96.4&#x2010;100.0)</td><td align="left" valign="top">16.7 (10.7&#x2010;25.1)</td><td align="left" valign="top">71.9 (66.6&#x2010;76.6)</td></tr><tr><td align="left" valign="top">Ovarian</td><td align="left" valign="top">95.6 (92.4&#x2010;97.5)</td><td align="left" valign="top">89.7 (85.3&#x2010;92.9)</td><td align="left" valign="top">63.5 (57.4&#x2010;69.2)</td><td align="left" valign="top">82.9 (80.1&#x2010;85.5)</td></tr></tbody></table></table-wrap></sec><sec id="s3-5"><title>Error Analysis</title><p>The tristate benchmark design enabled a logic-based characterization of model errors. 
All 235 misclassified cases were assigned to 1 of 3 error types according to how each misclassification violated the adjudication logic; predicted outcomes and single-sentence rationales were reviewed to characterize each category (<xref ref-type="table" rid="table5">Table 5</xref>).</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Error categories enabled by the tristate adjudication framework.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Error type and expected outcome</td><td align="left" valign="bottom">Predicted outcome</td><td align="left" valign="bottom">Explanation</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Information gap-filling</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Undeterminable</td><td align="left" valign="top">Eligible</td><td align="left" valign="top">One or more conditions were not provided, but the model assumed a met value for the missing attribute.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Undeterminable</td><td align="left" valign="top">Ineligible</td><td align="left" valign="top">One or more conditions were not provided, but the model assumed a not-met value for the missing attribute.</td></tr><tr><td align="left" valign="top" colspan="3">Criterion misapplication</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Eligible</td><td align="left" valign="top">Ineligible</td><td align="left" valign="top">All conditions were met, but the model applied a restriction not present in the guideline.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ineligible</td><td align="left" valign="top">Eligible</td><td 
align="left" valign="top">One or more conditions were explicitly not met, but the model confirmed eligibility based on the remaining met conditions.</td></tr><tr><td align="left" valign="top" colspan="3">False uncertainty</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Eligible</td><td align="left" valign="top">Undeterminable</td><td align="left" valign="top">All conditions were met, but the model treated one or more conditions as unevaluable and withheld judgment.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Ineligible</td><td align="left" valign="top">Undeterminable</td><td align="left" valign="top">One or more conditions were explicitly not met, but the model prioritized an unevaluable condition over the not-met finding.</td></tr></tbody></table></table-wrap><p>The most common error was information gap-filling (196/235, 83.4%), in which models inferred a definitive outcome from incomplete scenarios rather than recognizing missing information. The second most common error type was criterion misapplication (n=20, 8.5%), in which models classified cases as eligible despite 1 or more clearly not-met conditions or, conversely, classified cases as ineligible despite all conditions being met. Third, false uncertainty (n=19, 8.1%) refers to cases where models inferred undeterminable despite sufficient information to adjudicate eligible or ineligible. This pattern occurred most frequently in ineligible scenarios where 1 or more conditions were unevaluable but at least 1 clearly unmet condition was present.</p><p>To further understand the dominant error type, information gap cases were analyzed in relation to guideline structure. Reimbursement guidelines are organized hierarchically (<xref ref-type="other" rid="box2">Textbox 2</xref>), and model performance varied with structural complexity. 
In particular, subheading (L2) and footnote (L4) were the weak points of the models.</p><boxed-text id="box2"><title> Example structure of anticancer drug eligibility guideline.</title><p>[L1] Global Caveat: Cancer-type-level conditions applied to all regimens (eg, default scope of covered indications)</p><p>[L2] Treatment Intent A</p><p><named-content content-type="indent">&#x2003;</named-content><named-content content-type="indent">&#x2003;</named-content>&#x2514;&#x2500; Table</p><p><named-content content-type="indent">&#x2003;</named-content><named-content content-type="indent">&#x2003;</named-content>&#x2514;&#x2500;[L3] Regimen A [Indication A]</p><p><named-content content-type="indent">&#x2003;</named-content><named-content content-type="indent">&#x2003;</named-content>&#x2514;&#x2500;[L3] Regimen B [Indication B]</p><p><named-content content-type="indent">&#x2003;</named-content><named-content content-type="indent">&#x2003;</named-content>&#x2514;&#x2500;&#x2026;</p><p>[L2] Treatment Intent B</p><p><named-content content-type="indent">&#x2003;</named-content><named-content content-type="indent">&#x2003;</named-content>&#x2514;&#x2500; Table</p><p><named-content content-type="indent">&#x2003;</named-content><named-content content-type="indent">&#x2003;</named-content>&#x2514;&#x2500;[L3] Regimen A [Indication C]</p><p><named-content content-type="indent">&#x2003;</named-content><named-content content-type="indent">&#x2003;</named-content>&#x2514;&#x2500;[L3] Regimen C [Indication D]</p><p><named-content content-type="indent">&#x2003;</named-content><named-content content-type="indent">&#x2003;</named-content>&#x2514;&#x2500;&#x2026;</p><p>[L2] Treatment Intent C</p><p><named-content content-type="indent">&#x2003;</named-content><named-content content-type="indent">&#x2003;</named-content>&#x2514;&#x2500;[L3] Regimen D [Indication E]</p><p><named-content content-type="indent">&#x2003;</named-content><named-content 
content-type="indent">&#x2003;</named-content>&#x2514;&#x2500;[L3] Regimen E [Indication F]</p><p><named-content content-type="indent">&#x2003;</named-content><named-content content-type="indent">&#x2003;</named-content>&#x2514;&#x2500;&#x2026;</p><p>[L2] Treatment Intent D</p><p><named-content content-type="indent">&#x2003;</named-content><named-content content-type="indent">&#x2003;</named-content>&#x2514;&#x2500;[L3] Regimen F [Indication G]*</p><p><named-content content-type="indent">&#x2003;</named-content><named-content content-type="indent">&#x2003;</named-content>&#x2514;&#x2500;&#x2026;.</p><p>[L4] *Footnote: Externalized condition at document tail, applied via explicit cross-reference (eg, &#x201C;see Note 1&#x201D;)</p></boxed-text><p>The largest subtype involved models ignoring the document subheading (L2) that specifies therapeutic intent (eg, palliative, maintenance). This distinction is clinically significant because identical regimens may appear under multiple therapeutic purposes with different indications, and reimbursement is granted only for the purpose under which the regimen is listed.</p><p>Another subtype involved footnote (L4) recognition. The designated institution requirement for immune checkpoint inhibitors, which restricts eligible prescribing sites to those with qualified specialist staffing, appeared only in guideline footnotes, and most models systematically missed this condition.</p></sec><sec id="s3-6"><title>Sensitivity Analysis</title><p>Building on the failure modes identified in the error analysis, 3 sensitivity conditions were evaluated (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>). These analyses included structured text, web search&#x2013;enabled, and structure-guided prompt conditions.</p><p>In the structured text condition, the parsed guideline in Markdown format was provided to the models. 
Performance declined across all 6 models, with statistically significant degradation observed in the 2 Claude models (Claude Opus 4.6: &#x0394;=&#x2212;10.4 pp; Claude Sonnet 4.6: &#x0394;=&#x2212;10.8 pp; both <italic>q</italic>&#x003C;.001). The remaining 4 models showed smaller declines that did not reach significance.</p><p>The web search&#x2013;enabled condition assessed the effect of additional information access alongside the guideline document, while the structure-guided prompt condition incorporated the hierarchical levels of the guideline (<xref ref-type="other" rid="box2">Textbox 2</xref>) into the prompt and instructed models to verify each eligibility condition step by step against the corresponding level. Neither condition produced a statistically significant change from baseline in any model (all <italic>q</italic>&#x003E;.05).</p><p>Notably, web search tool invocation rates ranged from 0% to 3.2% across the 6 models under the web search&#x2013;enabled condition. Token usage nevertheless increased substantially: OpenAI models showed approximately 4500 additional input tokens per case from the embedded tool specification, while Claude models produced longer reasoning.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>This study developed a benchmark for anticancer drug reimbursement eligibility determination covering 3 gynecologic cancers and used it to evaluate 6 LLMs from 3 providers. The benchmark was based on a tristate adjudication framework that distinguishes condition-level states (met, not met, unevaluable) and case-level outcomes (eligible, ineligible, and undeterminable) for eligibility verification. Case-level outcomes are derived from the aggregation of condition-level states.</p><p>The principal findings are threefold. First, overall accuracy varied across models, ranging from 77.9% to 88.7%, indicating broadly comparable performance across the 6 evaluated models. 
Second, performance varied sharply by outcome class. While eligible and ineligible cases were classified with consistently high recall, undeterminable cases posed the greatest challenge across all models, revealing a consistent limitation in recognizing when clinical information is insufficient for a determination. Third, the tristate benchmark design enabled a logic-based characterization of model errors. Rather than treating all incorrect predictions as equivalent, the framework distinguished information gap-filling, criterion misapplication, and false uncertainty according to how each misclassification violated the expected adjudication logic.</p><p>These findings suggest that LLMs may help reduce the manual burden of eligibility review for clearly eligible and ineligible cases. However, these models should be positioned as decision-support tools operating under human oversight, not as autonomous decision-makers. The systematic failure in undeterminable cases represents the most clinically consequential limitation, as models tend to resolve ambiguity toward eligibility rather than deferring judgment when information is incomplete.</p><p>The dominant undeterminable failure pattern&#x2014;information gap-filling&#x2014;has direct clinical implications. When models misclassified undeterminable cases, the predominant direction was toward eligible rather than ineligible, indicating a tendency to infer eligibility from incomplete information. The severity of this pattern varied by cancer type, corresponding to the number of eligibility conditions per regimen. Uterine cancer regimens, which had the largest mean number of eligibility conditions, showed the lowest undeterminable recall.</p><p>This tendency has potential financial implications for hospitals. 
Under South Korea&#x2019;s retrospective claims review structure, ineligibility identified during postadministration audit can result in full denial of previously reimbursed drug costs, with the burden shifted to the prescribing institution. Because high-cost agents such as immune checkpoint inhibitors, antiangiogenic agents, and poly (ADP-ribose) polymerase inhibitors are reimbursed under exception-coverage provisions for severe diseases, even a small number of misclassifications can accumulate into substantial losses. Given that per-cycle costs typically range from KRW 3 million to KRW 6 million (approximately US $2200&#x2010;$4400), cumulative nonrecoverable costs can reach tens of millions of KRW per patient. Although the per-case inference cost itself is minimal (approximately US $0.002&#x2010;$0.10 across the 6 models), an erroneous determination at this cost can still translate into substantial financial losses through retrospective denial.</p><p>Meanwhile, sensitivity analysis revealed 3 patterns. First, converting the guideline file to structured text degraded performance across all models, indicating that document structure conveys eligibility logic lost during text extraction. Second, enabling web search did not produce a statistically significant change from baseline for any model (all <italic>q</italic>&#x003E;.05), with models defaulting to the provided guideline rather than calling the search tool. This pattern indicates that in realistic scenarios where authoritative source documents are accessible to the LLM, the models rely on the provided context, and enabling the search tool neither improved nor degraded outcomes while increasing inference cost. 
Third, structure-guided prompting did not produce a statistically significant improvement for any of the 6 models (all <italic>q</italic>&#x003E;.05), indicating that explicit structural guidance combined with step-by-step per-condition verification was insufficient to overcome the inherent limitations in hierarchical guideline reasoning observed in this benchmark.</p><p>Taken together, these results indicate that current LLMs are not yet reliable as standalone tools for reimbursement eligibility determination, particularly when clinical information is incomplete. Because neither input reformatting, information augmentation, nor prompt modification alone reliably improved performance, more systematic integration strategies will be needed to deploy LLMs safely in reimbursement workflows.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>Most LLM benchmarks in the medical domain have focused on knowledge assessment through multiple-choice examinations such as MedQA [<xref ref-type="bibr" rid="ref21">21</xref>], while real-world task evaluation remains scarce [<xref ref-type="bibr" rid="ref22">22</xref>]. Recent frameworks such as MedHELM [<xref ref-type="bibr" rid="ref23">23</xref>] have begun addressing this gap but reveal that administrative and workflow tasks remain the weakest evaluation category.</p><p>Within guideline-based reasoning, MedGUIDE [<xref ref-type="bibr" rid="ref24">24</xref>] and CPGPrompt [<xref ref-type="bibr" rid="ref25">25</xref>] evaluated LLM adherence to cancer treatment decision trees and referral classification guidelines, respectively, both finding that models frequently deviate from structured conditional logic. Outside the medical domain, RuleArena [<xref ref-type="bibr" rid="ref26">26</xref>] reported similar difficulties with complex rule-guided reasoning. 
However, none of these benchmarks test whether models can recognize when information is insufficient for a determination; all evaluate scenarios in which sufficient information is provided and a definitive answer exists.</p><p>This limitation is directly relevant to our findings. Recent abstention studies&#x2014;AbstentionBench [<xref ref-type="bibr" rid="ref27">27</xref>] across 20 datasets and MedAbstain [<xref ref-type="bibr" rid="ref28">28</xref>] in medical question answering&#x2014;showed that providing an explicit abstention option consistently increased model uncertainty and promoted safer abstention behavior, whereas scaling model size or applying structure-guided prompting yielded little improvement.</p><p>Our undeterminable outcome class parallels these abstention scenarios in a domain-specific context. However, whereas these benchmarks test whether models can detect missing information within a self-contained question, our benchmark additionally requires models to extract eligibility conditions from a hierarchically structured guideline document and then determine whether the patient scenario provides sufficient information to evaluate each condition.</p><p>To our knowledge, no prior benchmark has evaluated LLM performance on health insurance reimbursement eligibility. This study addresses this gap by introducing a benchmark grounded in national anticancer drug coverage criteria with expert validation, incorporating an undeterminable outcome class that explicitly tests recognition of incomplete information.</p></sec><sec id="s4-3"><title>Limitations</title><p>This study has several limitations. First, the benchmark was limited to 3 gynecologic cancers based on South Korean NHI guidelines and relied on a relatively small dataset. Its generalizability to other cancer types, guideline versions, or insurance systems requires further validation.</p><p>Second, all patient scenarios were synthetically constructed in a structured format. 
This design does not capture the additional uncertainty introduced by extracting relevant attributes from unstructured records. Because real-world clinical documentation is often incomplete and unstructured, the performance observed in this study may overestimate model utility in practice.</p><p>Third, some eligibility conditions appeared in the global caveat and footnotes; errors involving these structural elements may partly reflect document representation rather than reasoning limitations of models alone.</p><p>Fourth, our benchmark adopts a strict rule that treats any missing information required by guideline criteria as undeterminable. This reflects a fundamental distinction between clinical reasoning and administrative reasoning. For instance, in clinical reasoning, the absence of a prior treatment record may suggest that the patient has not previously been treated, whereas in administrative adjudication, it constitutes missing evidence. Findings should, therefore, be interpreted as specific to administrative reasoning tasks.</p><p>Fifth, while the qualitative error analysis was conducted with model identities blinded, LLM outputs often contain stylistic fingerprints (eg, characteristic phrasing or cadences) that may make perfect blinding difficult. Residual unblinding may, therefore, introduce some bias into our error categorization.</p><p>Finally, models were tested with provider-default settings without fine-tuning or agentic frameworks, which may underestimate achievable performance.</p></sec><sec id="s4-4"><title>Future Work</title><p>Several directions arise from these limitations. Future work should expand the benchmark to additional cancer types and guideline versions and validate it using real clinical documentation to better reflect performance under unstructured and incomplete records. 
A direct human performance baseline, in which experts solve the same cases under identical conditions, would also be valuable to contextualize LLM accuracy against expert-level performance.</p><p>Methodologically, agentic approaches that iteratively verify each condition against the guideline may improve accuracy on complex hierarchical structures. In addition, hybrid neuro-symbolic architectures that pair LLM-based understanding with structured rule engines have shown initial promise in adjacent health care tasks [<xref ref-type="bibr" rid="ref29">29</xref>]; their applicability to reimbursement reasoning under incomplete information, particularly in handling rigid AND/OR logical constraints, remains to be evaluated.</p></sec><sec id="s4-5"><title>Conclusion</title><p>The tristate adjudication framework introduced in this study offers a distinctive approach to evaluating LLM behavior in logical reasoning contexts characterized by variable information completeness. In this benchmark, LLMs classified clearly eligible and ineligible cases with relatively high recall but showed limited reliability on undeterminable cases. The dominant error pattern was information gap-filling, in which models tended to infer eligibility rather than withhold judgment. These findings indicate that LLMs, in their current form, should be deployed as supervised decision-support tools rather than as independent adjudicators in reimbursement review.</p></sec></sec></body><back><ack><p>The authors used Claude (Opus 4.6, Anthropic, 2026) to assist with improving the readability and language editing of the manuscript. 
All outputs were reviewed and verified by the authors, who take full responsibility for the final content.</p></ack><notes><sec><title>Funding</title><p>The authors declared that no financial support was received for this work.</p></sec><sec><title>Data Availability</title><p>Upon publication, the benchmark dataset, prompt templates, and evaluation scripts are available on GitHub [<xref ref-type="bibr" rid="ref30">30</xref>] and archived on Zenodo [<xref ref-type="bibr" rid="ref31">31</xref>].</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: JS, TK</p><p>Data curation: JS</p><p>Formal analysis: JS</p><p>Methodology: JS, TK, JHK</p><p>Software: JS</p><p>Supervision: TK, JHK</p><p>Validation: JHK</p><p>Visualization: JS</p><p>Writing &#x2013; original draft: JS</p><p>Writing &#x2013; review &#x0026; editing: JS, TK, JHK</p></fn><fn fn-type="conflict"><p>TK is cofounder and chief executive officer of ETOILE Inc, a health care artificial intelligence company. JS is an employee of ETOILE Inc. The benchmark described in this manuscript is not a commercial product of ETOILE Inc. The authors have no financial relationships with any of the commercial large language model vendors evaluated in this study. 
JHK reports no competing interests.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">HIRA</term><def><p>Health Insurance Review &#x0026; Assessment Service</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">NHI</term><def><p>National Health Insurance</p></def></def-item><def-item><term id="abb4">pp</term><def><p>percentage points</p></def></def-item><def-item><term id="abb5">TRIPOD-LLM</term><def><p>Transparent Reporting of a Multivariable Prediction Model for Individual Prognosis or Diagnosis-Large Language Model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jiwani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Himmelstein</surname><given-names>D</given-names> </name><name name-style="western"><surname>Woolhandler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kahn</surname><given-names>JG</given-names> </name></person-group><article-title>Billing and insurance-related administrative costs in United States&#x2019; health care: synthesis of micro-costing evidence</article-title><source>BMC Health Serv Res</source><year>2014</year><month>11</month><day>13</day><volume>14</volume><fpage>556</fpage><pub-id pub-id-type="doi">10.1186/s12913-014-0556-7</pub-id><pub-id pub-id-type="medline">25540104</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tseng</surname><given-names>P</given-names> </name><name name-style="western"><surname>Kaplan</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Richman</surname><given-names>BD</given-names> </name><name 
name-style="western"><surname>Shah</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Schulman</surname><given-names>KA</given-names> </name></person-group><article-title>Administrative costs associated with physician billing and insurance-related activities at an academic health care system</article-title><source>JAMA</source><year>2018</year><month>02</month><day>20</day><volume>319</volume><issue>7</issue><fpage>691</fpage><lpage>697</lpage><pub-id pub-id-type="doi">10.1001/jama.2017.19148</pub-id><pub-id pub-id-type="medline">29466590</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Erickson</surname><given-names>SM</given-names> </name><name name-style="western"><surname>Rockwern</surname><given-names>B</given-names> </name><name name-style="western"><surname>Koltov</surname><given-names>M</given-names> </name><name name-style="western"><surname>McLean</surname><given-names>RM</given-names> </name><collab>Medical Practice and Quality Committee of the American College of Physicians</collab></person-group><article-title>Putting patients first by reducing administrative tasks in health care: a position paper of the American College of Physicians</article-title><source>Ann Intern Med</source><year>2017</year><month>05</month><day>2</day><volume>166</volume><issue>9</issue><fpage>659</fpage><lpage>661</lpage><pub-id pub-id-type="doi">10.7326/M16-2697</pub-id><pub-id pub-id-type="medline">28346948</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kyle</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>KY</given-names> </name><name name-style="western"><surname>Wade</surname><given-names>CG</given-names> </name><name 
name-style="western"><surname>Yaver</surname><given-names>M</given-names> </name></person-group><article-title>Patient administrative burden: a scoping review</article-title><source>Health Aff Sch</source><year>2025</year><month>11</month><volume>3</volume><issue>11</issue><fpage>qxaf216</fpage><pub-id pub-id-type="doi">10.1093/haschl/qxaf216</pub-id><pub-id pub-id-type="medline">41278120</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Richman</surname><given-names>BD</given-names> </name><name name-style="western"><surname>Kaplan</surname><given-names>RS</given-names> </name><name name-style="western"><surname>Kohli</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Billing and insurance-related administrative costs: a cross-national analysis</article-title><source>Health Aff (Millwood)</source><year>2022</year><month>08</month><volume>41</volume><issue>8</issue><fpage>1098</fpage><lpage>1106</lpage><pub-id pub-id-type="doi">10.1377/hlthaff.2022.00241</pub-id><pub-id pub-id-type="medline">35914203</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sohn</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jung</surname><given-names>M</given-names> </name></person-group><article-title>Effects of public and private health insurance on medical service utilization in the National Health Insurance System: national panel study in the Republic of Korea</article-title><source>BMC Health Serv Res</source><year>2016</year><month>09</month><day>21</day><volume>16</volume><issue>1</issue><fpage>503</fpage><pub-id pub-id-type="doi">10.1186/s12913-016-1746-2</pub-id><pub-id pub-id-type="medline">27654146</pub-id></nlm-citation></ref><ref 
id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Yoon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>LY</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>DS</given-names> </name></person-group><article-title>Towards actualizing the value potential of Korea Health Insurance Review and Assessment (HIRA) data as a resource for health research: strengths, limitations, applications, and strategies for optimal use of HIRA data</article-title><source>J Korean Med Sci</source><year>2017</year><month>05</month><volume>32</volume><issue>5</issue><fpage>718</fpage><lpage>728</lpage><pub-id pub-id-type="doi">10.3346/jkms.2017.32.5.718</pub-id><pub-id pub-id-type="medline">28378543</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>S</given-names> </name></person-group><article-title>A guide for the utilization of Health Insurance Review and Assessment Service National Patient Samples</article-title><source>Epidemiol Health</source><year>2014</year><volume>36</volume><fpage>e2014008</fpage><pub-id pub-id-type="doi">10.4178/epih/e2014008</pub-id><pub-id pub-id-type="medline">25078381</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shin</surname><given-names>HC</given-names> </name><name name-style="western"><surname>Park</surname><given-names>YT</given-names> </name><name 
name-style="western"><surname>Lee</surname><given-names>YT</given-names> </name><name name-style="western"><surname>Jo</surname><given-names>EC</given-names> </name></person-group><article-title>Healthcare utilization monitoring system in Korea</article-title><source>Healthc Inform Res</source><year>2015</year><month>07</month><volume>21</volume><issue>3</issue><fpage>184</fpage><lpage>190</lpage><pub-id pub-id-type="doi">10.4258/hir.2015.21.3.184</pub-id><pub-id pub-id-type="medline">26279955</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yun</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jo</surname><given-names>M</given-names> </name><name name-style="western"><surname>Heo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>DS</given-names> </name></person-group><article-title>National expenditures on anticancer and immunomodulating agents during 2013-2022 in Korea</article-title><source>J Korean Med Sci</source><year>2025</year><month>02</month><day>10</day><volume>40</volume><issue>5</issue><fpage>e16</fpage><pub-id pub-id-type="doi">10.3346/jkms.2025.40.e16</pub-id><pub-id pub-id-type="medline">39938871</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="report"><article-title>Cancer drug and regimen guidelines: public announcements [Report in Korean]</article-title><year>2026</year><access-date>2026-06-02</access-date><publisher-name>Health Insurance Review &#x0026; Assessment Service</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.hira.or.kr/bbsDummy.do?pgmid=HIRAA030023010000">https://www.hira.or.kr/bbsDummy.do?pgmid=HIRAA030023010000</ext-link></comment></nlm-citation></ref><ref 
id="ref12"><label>12</label><nlm-citation citation-type="report"><article-title>AI as a healthcare ally: how Americans are navigating the system with ChatGPT</article-title><year>2026</year><access-date>2026-06-02</access-date><publisher-name>OpenAI</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://cdn.openai.com/pdf/2cb29276-68cd-4ec6-a5f4-c01c5e7a36e9/OpenAI-AI-as-a-Healthcare-Ally-Jan-2026.pdf">https://cdn.openai.com/pdf/2cb29276-68cd-4ec6-a5f4-c01c5e7a36e9/OpenAI-AI-as-a-Healthcare-Ally-Jan-2026.pdf</ext-link></comment></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Seo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Evaluation framework of large language models in medical documentation: development and usability study</article-title><source>J Med Internet Res</source><year>2024</year><month>11</month><day>20</day><volume>26</volume><fpage>e58329</fpage><pub-id pub-id-type="doi">10.2196/58329</pub-id><pub-id pub-id-type="medline">39566044</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Song</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Park</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>JH</given-names> </name><name name-style="western"><surname>You</surname><given-names>SC</given-names> </name></person-group><article-title>Large language model assistant for emergency department discharge documentation</article-title><source>JAMA Netw 
Open</source><year>2025</year><month>10</month><day>1</day><volume>8</volume><issue>10</issue><fpage>e2538427</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2025.38427</pub-id><pub-id pub-id-type="medline">41118162</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jones</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Torgbi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Tayyar Madabushi</surname><given-names>H</given-names> </name></person-group><article-title>Improving the understandability of clinical guidelines: development and evaluation of a GPT-4-based pipeline</article-title><source>J Med Internet Res</source><year>2026</year><month>02</month><day>23</day><volume>28</volume><fpage>e81915</fpage><pub-id pub-id-type="doi">10.2196/81915</pub-id><pub-id pub-id-type="medline">41730207</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zheng</surname><given-names>NS</given-names> </name><name name-style="western"><surname>Keloth</surname><given-names>VK</given-names> </name><name name-style="western"><surname>You</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Detection of gastrointestinal bleeding with large language models to aid quality improvement and appropriate reimbursement</article-title><source>Gastroenterology</source><year>2025</year><month>01</month><volume>168</volume><issue>1</issue><fpage>111</fpage><lpage>120</lpage><pub-id pub-id-type="doi">10.1053/j.gastro.2024.09.014</pub-id><pub-id pub-id-type="medline">39304088</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Hou</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Bian</surname><given-names>J</given-names> </name><name name-style="western"><surname>He</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhuang</surname><given-names>Y</given-names> </name></person-group><article-title>Enhancing medical coding efficiency through domain-specific fine-tuned large language models</article-title><source>Npj Health Syst</source><year>2025</year><volume>2</volume><issue>1</issue><fpage>14</fpage><pub-id pub-id-type="doi">10.1038/s44401-025-00018-3</pub-id><pub-id pub-id-type="medline">40321467</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Smith</surname><given-names>PC</given-names> </name><name name-style="western"><surname>Araya-Guerra</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bublitz</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Missing clinical information during primary care visits</article-title><source>JAMA</source><year>2005</year><month>02</month><day>2</day><volume>293</volume><issue>5</issue><fpage>565</fpage><lpage>571</lpage><pub-id pub-id-type="doi">10.1001/jama.293.5.565</pub-id><pub-id pub-id-type="medline">15687311</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Burnett</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Deelchand</surname><given-names>V</given-names> </name><name name-style="western"><surname>Franklin</surname><given-names>BD</given-names> </name><name 
name-style="western"><surname>Moorthy</surname><given-names>K</given-names> </name><name name-style="western"><surname>Vincent</surname><given-names>C</given-names> </name></person-group><article-title>Missing clinical information in NHS hospital outpatient clinics: prevalence, causes and effects on patient care</article-title><source>BMC Health Serv Res</source><year>2011</year><month>05</month><day>23</day><volume>11</volume><fpage>114</fpage><pub-id pub-id-type="doi">10.1186/1472-6963-11-114</pub-id><pub-id pub-id-type="medline">21605359</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gallifant</surname><given-names>J</given-names> </name><name name-style="western"><surname>Afshar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ameen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>The TRIPOD-LLM reporting guideline for studies using large language models</article-title><source>Nat Med</source><year>2025</year><month>01</month><volume>31</volume><issue>1</issue><fpage>60</fpage><lpage>69</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03425-5</pub-id><pub-id pub-id-type="medline">39779929</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>E</given-names> </name><name name-style="western"><surname>Oufattole</surname><given-names>N</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Szolovits</surname><given-names>P</given-names> 
</name></person-group><article-title>What disease does this patient have? A large-scale open domain question answering dataset from medical exams</article-title><source>Appl Sci</source><year>2021</year><volume>11</volume><issue>14</issue><fpage>6421</fpage><pub-id pub-id-type="doi">10.3390/app11146421</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Orr-Ewing</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Testing and evaluation of health care applications of large language models: a systematic review</article-title><source>JAMA</source><year>2025</year><month>01</month><day>28</day><volume>333</volume><issue>4</issue><fpage>319</fpage><lpage>328</lpage><pub-id pub-id-type="doi">10.1001/jama.2024.21700</pub-id><pub-id pub-id-type="medline">39405325</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Cui</surname><given-names>H</given-names> </name><name name-style="western"><surname>Fuentes</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Holistic evaluation of large language models for medical tasks with MedHELM</article-title><source>Nat Med</source><year>2026</year><month>03</month><volume>32</volume><issue>3</issue><fpage>943</fpage><lpage>951</lpage><pub-id pub-id-type="doi">10.1038/s41591-025-04151-2</pub-id><pub-id pub-id-type="medline">41559415</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Li</surname><given-names>X</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hao</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>MedGUIDE: benchmarking clinical decision-making in large language models</article-title><access-date>2026-06-02</access-date><conf-name>39th Conference on Neural Information Processing Systems (NeurIPS 2025)</conf-name><conf-date>Dec 2-7, 2025</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://openreview.net/pdf/07c2a85f2355803fe92312f6a9808c0e5065ddef.pdf">https://openreview.net/pdf/07c2a85f2355803fe92312f6a9808c0e5065ddef.pdf</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deng</surname><given-names>R</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>G</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>CPGPrompt: translating clinical guidelines into large language model-executable decision support</article-title><source>J Am Med Inform Assoc</source><year>2026</year><month>04</month><day>1</day><volume>33</volume><issue>4</issue><fpage>855</fpage><lpage>862</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocag026</pub-id><pub-id pub-id-type="medline">41746783</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hua</surname><given-names>W</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>L</given-names> </name><etal/></person-group><article-title>RuleArena: a 
benchmark for rule-guided reasoning with LLMs in real-world scenarios</article-title><conf-name>The 63rd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name><conf-date>Jul 27 to Aug 1, 2025</conf-date><pub-id pub-id-type="doi">10.18653/v1/2025.acl-long.27</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kirichenko</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ibrahim</surname><given-names>M</given-names> </name><name name-style="western"><surname>Chaudhuri</surname><given-names>K</given-names> </name><name name-style="western"><surname>Bell</surname><given-names>SJ</given-names> </name></person-group><article-title>AbstentionBench: reasoning LLMs fail on unanswerable questions</article-title><access-date>2026-06-02</access-date><conf-name>39th Conference on Neural Information Processing Systems (NeurIPS 2025) Track on Datasets and Benchmarks</conf-name><conf-date>Dec 2-7, 2025</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2025/hash/fb122bfc3f0127a94ded048b5b03496f-Abstract-Datasets_and_Benchmarks_Track.html">https://proceedings.neurips.cc/paper_files/paper/2025/hash/fb122bfc3f0127a94ded048b5b03496f-Abstract-Datasets_and_Benchmarks_Track.html</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Machcha</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yerra</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Knowing when to abstain: medical LLMs under clinical uncertainty</article-title><conf-name>19th Conference of 
the European Chapter of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name><conf-date>Mar 24-29, 2026</conf-date><pub-id pub-id-type="doi">10.18653/v1/2026.eacl-long.291</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Prenosil</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Weitzel</surname><given-names>TK</given-names> </name><name name-style="western"><surname>Bello</surname><given-names>SC</given-names> </name><etal/></person-group><article-title>Neuro-symbolic AI for auditable cognitive information extraction from medical reports</article-title><source>Commun Med (Lond)</source><year>2025</year><month>11</month><day>21</day><volume>5</volume><issue>1</issue><fpage>491</fpage><pub-id pub-id-type="doi">10.1038/s43856-025-01194-x</pub-id><pub-id pub-id-type="medline">41272253</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><article-title>seo-see/K-NHIB</article-title><source>GitHub</source><access-date>2026-06-07</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/seo-see/K-NHIB">https://github.com/seo-see/K-NHIB</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><article-title>K-NHIB: Korean National Health Insurance Benchmark</article-title><source>Zenodo</source><access-date>2026-06-07</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://doi.org/10.5281/zenodo.20563360">https://doi.org/10.5281/zenodo.20563360</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Overall accuracy across repeated runs and majority voting with 95% CI.</p><media xlink:href="jmir_v28i1e95877_app1.docx" xlink:title="DOCX File, 17 
KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Large language model performance per class with majority voting (n=74 per class; 95% CI).</p><media xlink:href="jmir_v28i1e95877_app2.docx" xlink:title="DOCX File, 542 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Bhapkar test of marginal homogeneity comparing each large language model's predicted distribution against the observed distribution (eligible=74, ineligible=74, undeterminable=74; balanced design) across the 3 eligibility categories.</p><media xlink:href="jmir_v28i1e95877_app3.docx" xlink:title="DOCX File, 16 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Model-wise recall by cancer type and outcome class with 95% CI.</p><media xlink:href="jmir_v28i1e95877_app4.docx" xlink:title="DOCX File, 18 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Sensitivity analysis results with pairwise comparisons against baseline using McNemar test with Benjamini-Hochberg correction.</p><media xlink:href="jmir_v28i1e95877_app5.docx" xlink:title="DOCX File, 18 KB"/></supplementary-material><supplementary-material id="app6"><label>Checklist 1</label><p>TRIPOD-LLM checklist.</p><media xlink:href="jmir_v28i1e95877_app6.docx" xlink:title="DOCX File, 28 KB"/></supplementary-material></app-group></back></article>