<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="letter" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v26i1e60695</article-id>
      <article-id pub-id-type="pmid">39405514</article-id>
      <article-id pub-id-type="doi">10.2196/60695</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Research Letter</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Research Letter</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Performance of Retrieval-Augmented Large Language Models to Recommend Head and Neck Cancer Clinical Trials</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Jin</surname>
            <given-names>Qiao</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chan</surname>
            <given-names>Shan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Bracken-Clarke</surname>
            <given-names>Dara</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Chen</surname>
            <given-names>Fangyuan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Hung</surname>
            <given-names>Tony K W</given-names>
          </name>
          <degrees>MD, MBA, MSCR</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Memorial Sloan Kettering Cancer Center</institution>
            <addr-line>530 E 74th St</addr-line>
            <addr-line>New York, NY, 10021</addr-line>
            <country>United States</country>
            <fax>1 646 888 4269</fax>
            <phone>1 646 608 4127</phone>
            <email>hungt@mskcc.org</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3872-5512</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Kuperman</surname>
            <given-names>Gilad J</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4130-1577</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Sherman</surname>
            <given-names>Eric J</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4960-4783</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Ho</surname>
            <given-names>Alan L</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-6885-3742</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Weng</surname>
            <given-names>Chunhua</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9624-0214</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Pfister</surname>
            <given-names>David G</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2109-3221</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Mao</surname>
            <given-names>Jun J</given-names>
          </name>
          <degrees>MD, MSCE</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9229-0380</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Memorial Sloan Kettering Cancer Center</institution>
        <addr-line>New York, NY</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Columbia University, Department of Biomedical Informatics</institution>
        <addr-line>New York, NY</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Tony K W Hung <email>hungt@mskcc.org</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>15</day>
        <month>10</month>
        <year>2024</year>
      </pub-date>
      <volume>26</volume>
      <elocation-id>e60695</elocation-id>
      <history>
        <date date-type="received">
          <day>18</day>
          <month>5</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>20</day>
          <month>6</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>12</day>
          <month>8</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>3</day>
          <month>9</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Tony K W Hung, Gilad J Kuperman, Eric J Sherman, Alan L Ho, Chunhua Weng, David G Pfister, Jun J Mao. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 15.10.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2024/1/e60695" xlink:type="simple"/>
      <kwd-group>
        <kwd>large language model</kwd>
        <kwd>LLM</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>GPT-4</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>AI</kwd>
        <kwd>clinical trials</kwd>
        <kwd>decision support</kwd>
        <kwd>LookUpTrials</kwd>
        <kwd>cancer care delivery</kwd>
        <kwd>head and neck oncology</kwd>
        <kwd>head and neck cancer</kwd>
        <kwd>retrieval augmented generation</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Chatbots based on large language models (LLMs) have demonstrated the ability to answer oncology examination questions with impressive accuracy without specialized training or reinforcement [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]; however, leveraging LLMs in oncology decision support has not yet demonstrated suitable performance, as LLMs would produce responses that deviate from cancer expert recommendations and guidelines [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. Furthermore, the rapidly changing oncology landscape, including knowledge of cancer clinical trials, limits the meaningful use of LLMs in practice given delays in training dataset updates. To enhance LLM utility in oncology practice, we developed a retrieval-augmented LLM, powered by GPT-4, and evaluated its performance to provide appropriate clinical trial recommendations for a head and neck (HN) cancer population.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <p>On February 1, 2022, we piloted a clinical trial knowledge management application, LookUpTrials, at the Memorial Sloan Kettering Cancer Center (MSK) [<xref ref-type="bibr" rid="ref6">6</xref>]. Using LookUpTrials’ real-time database, we applied retrieval-augmented generation architecture and direct preference optimization to fine-tune GPT-4 as a clinical trial decision assistant [<xref ref-type="bibr" rid="ref7">7</xref>]. Specifically, we enabled retrieval-augmented GPT-4 to respond with up-to-date information—such as trial availability—developed initial prompts, and validated GPT-4 responses from 1120 preference pairs across 56 MSK HN clinical trials. Preference pairs were constructed in [trial : attributes] format, including 20 organizational, investigator, and study attribute types (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Data labels were annotated by author TKWH and cross-verified by 2 trial managers. From November 7, 2023, to January 30, 2024, we collected all consecutive new patient cases and their respective clinical trial recommendations, which were made by consensus during a weekly HN conference attended by 5-8 oncologists with 2 to more than 25 years of practice experience. Cases were categorized by diagnosis, biomarkers, cancer stage, treatment setting, and physician recommendations on clinical trials. Using these cases as test datasets, we prompted retrieval-augmented GPT-4 using a semistructured template, as follows: “Given patient with a &#60;biomarkers&#62;, &#60;diagnosis&#62;, &#60;cancer stage&#62;, &#60;treatment setting&#62;, what are possible clinical trials?” (eg, given a patient with human papillomavirus–associated HN cancer, metastatic stage, in a first-line treatment setting, what are the possible clinical trials?). 
GPT-4 responses were compared with physician recommendations, with concordance defined a priori: a GPT-4 response was a true positive if it included the recommended clinical trial(s); a true negative if neither the GPT-4 response nor the physicians recommended any clinical trial(s); a false positive if the GPT-4 response recommended clinical trial(s) but physicians did not; and a false negative if the GPT-4 response did not recommend clinical trial(s) but the physicians did. We analyzed the performance of GPT-4 based on its response precision (positive predictive value), recall (sensitivity), and <italic>F</italic><sub>1</sub>-score (harmonic mean of precision and recall). We further analyzed subgroup performance by cancer types and the presence of biomarkers. Statistical analyses were performed using JMP-17.2.0.</p>
      <sec>
        <title>Ethical Considerations</title>
        <p>The MSK institutional review board approved the study (application number: 24-120).</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <p>We analyzed 178 patient cases (mean age 66, SD 13.9 years), primarily male (n=134, 75.3%), with local/locally advanced cancers (n=121, 68.0%), including HN (n=109, 61.2%), thyroid (n=29, 16.3%), skin (n=16, 9.0%), or salivary gland (n=14, 7.9%) cancers (<xref ref-type="table" rid="table1">Table 1</xref>). Over one-third of cases had biomarkers (n=66, 37.1%). The majority were treated in the definitive setting with combined modality therapy (n=75, 42.1%), and a modest proportion were treated under clinical trials (n=18, 10.1%). Overall, retrieval-augmented GPT-4 achieved moderate performance (<xref ref-type="table" rid="table2">Table 2</xref>), matching physician clinical trial recommendations with 63.0% precision and 100.0% recall (<italic>F</italic><sub>1</sub>-score 0.77), narrowing a total of 56 HN clinical trials to a range of 0-4 relevant trials per patient case (mean 1, SD 1.2 trials). In comparison, baseline non–retrieval-augmented GPT-4 demonstrated 0.0% precision, recall, and <italic>F</italic><sub>1</sub>-score—given the lack of response specificity to MSK clinical trials. Subgroup precision varied by cancer types (HN cancers: 72.7%, skin cancers: 50.0%, salivary gland cancers: 36.4%, and thyroid cancers: 33.3%) and the presence of biomarkers (present: 72.7%, absent: 62.1%).</p>
      <table-wrap position="float" id="table1">
        <label>Table 1</label>
        <caption>
          <p>Baseline characteristics of patient cases (N=178).</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="30"/>
          <col width="540"/>
          <col width="0"/>
          <col width="400"/>
          <thead>
            <tr valign="top">
              <td colspan="4">Characteristics</td>
              <td>Overall values, n (%)</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td colspan="4">Age (years), mean (SD)</td>
              <td>66 (13.9)</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Sex</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Female</td>
              <td>44 (24.7)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">Male</td>
              <td>134 (75.3)</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Cancer types</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">
                <bold>Head and neck cancers</bold>
              </td>
              <td>109 (61.2)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Oropharyngeal SCC<sup>a</sup></td>
              <td colspan="2">49 (27.5)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Oral cavity SCC</td>
              <td colspan="2">22 (12.4)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Laryngeal SCC</td>
              <td colspan="2">18 (10.1)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Hypopharyngeal SCC</td>
              <td colspan="2">8 (4.5)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Other</td>
              <td colspan="2">12 (6.7)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">
                <bold>Thyroid cancers</bold>
              </td>
              <td>29 (16.3)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Anaplastic thyroid carcinoma</td>
              <td colspan="2">4 (2.2)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Differentiated thyroid carcinoma</td>
              <td colspan="2">25 (14.0)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">
                <bold>Skin cancers</bold>
              </td>
              <td>16 (9.0)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">
                <bold>Salivary gland cancers</bold>
              </td>
              <td>14 (7.9)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Adenoid cystic carcinoma</td>
              <td colspan="2">5 (2.8)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Nonadenoid cystic carcinoma</td>
              <td colspan="2">9 (5.1)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">
                <bold>Other cancers</bold>
              </td>
              <td>10 (5.6)</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Cancer stage</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Local/locally advanced</td>
              <td colspan="2">121 (68.0)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Recurrent/metastatic</td>
              <td colspan="2">57 (32.0)</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Biomarkers</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">
                <bold>Present</bold>
              </td>
              <td>66 (37.1)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>HPV<sup>b</sup> or p16<sup>c</sup></td>
              <td colspan="2">42 (23.6)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>EBV<sup>d</sup></td>
              <td colspan="2">5 (2.8)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>BRAF<sup>e</sup> mutation</td>
              <td colspan="2">6 (3.4)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>RET<sup>f</sup> mutation</td>
              <td colspan="2">2 (1.1)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>AR<sup>g</sup></td>
              <td colspan="2">2 (1.1)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>HER2<sup>h</sup></td>
              <td colspan="2">3 (1.7)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Other</td>
              <td colspan="2">6 (3.4)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td colspan="3">
                <bold>None</bold>
              </td>
              <td>113 (63.5)</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Treatment settings</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Definitive</td>
              <td colspan="2">93 (52.2)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Palliative</td>
              <td colspan="2">51 (28.7)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Surveillance</td>
              <td colspan="2">15 (8.4)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Adjuvant</td>
              <td colspan="2">13 (7.3)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Diagnostic</td>
              <td colspan="2">6 (3.4)</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Treatment modality</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Combined modality therapy</td>
              <td colspan="2">75 (42.1)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Primary systemic treatment</td>
              <td colspan="2">37 (20.8)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Primary surgical treatment</td>
              <td colspan="2">11 (6.2)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Primary radiation treatment</td>
              <td colspan="2">8 (4.5)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Best supportive care</td>
              <td colspan="2">5 (2.8)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Other</td>
              <td colspan="2">24 (13.5)</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>
                <break/>
              </td>
              <td>Clinical trials</td>
              <td colspan="2">18 (10.1)</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table1fn1">
            <p><sup>a</sup>SCC: squamous cell carcinoma.</p>
          </fn>
          <fn id="table1fn2">
            <p><sup>b</sup>HPV: human papillomavirus.</p>
          </fn>
          <fn id="table1fn3">
            <p><sup>c</sup>p16: p16(INK4A) immunostain.</p>
          </fn>
          <fn id="table1fn4">
            <p><sup>d</sup>EBV: Epstein-Barr virus.</p>
          </fn>
          <fn id="table1fn5">
            <p><sup>e</sup>BRAF: V-Raf murine sarcoma viral oncogene homolog B.</p>
          </fn>
          <fn id="table1fn6">
            <p><sup>f</sup>RET: rearranged during transfection.</p>
          </fn>
          <fn id="table1fn7">
            <p><sup>g</sup>AR: androgen receptor.</p>
          </fn>
          <fn id="table1fn8">
            <p><sup>h</sup>HER2: human epidermal growth factor receptor 2.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
      <table-wrap position="float" id="table2">
        <label>Table 2</label>
        <caption>
          <p>Performance of retrieval-augmented large language models in matching physician clinical trial recommendations.</p>
        </caption>
        <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
          <col width="30"/>
          <col width="460"/>
          <col width="170"/>
          <col width="170"/>
          <col width="170"/>
          <thead>
            <tr valign="top">
              <td colspan="2">Performance</td>
              <td>Precision (%)</td>
              <td>Recall (%)</td>
              <td><italic>F</italic><sub>1</sub>-score</td>
            </tr>
          </thead>
          <tbody>
            <tr valign="top">
              <td colspan="2">Baseline GPT-4</td>
              <td>0.0</td>
              <td>0.0</td>
              <td>0</td>
            </tr>
            <tr valign="top">
              <td colspan="2">Retrieval-augmented GPT-4</td>
              <td>63.0</td>
              <td>100.0</td>
              <td>0.77</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Subgroups (cancer types)</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Head and neck cancers</td>
              <td>72.7</td>
              <td>100.0</td>
              <td>0.84</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Thyroid cancers</td>
              <td>33.3</td>
              <td>100.0</td>
              <td>0.50</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Skin cancers</td>
              <td>50.0</td>
              <td>100.0</td>
              <td>0.67</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Salivary gland cancers</td>
              <td>36.4</td>
              <td>100.0</td>
              <td>0.53</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Other cancers</td>
              <td>—<sup>a</sup></td>
              <td>—</td>
              <td>—</td>
            </tr>
            <tr valign="top">
              <td colspan="5">
                <bold>Subgroups (biomarkers)</bold>
              </td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>Present</td>
              <td>72.7</td>
              <td>100.0</td>
              <td>0.84</td>
            </tr>
            <tr valign="top">
              <td>
                <break/>
              </td>
              <td>None</td>
              <td>62.1</td>
              <td>100.0</td>
              <td>0.77</td>
            </tr>
          </tbody>
        </table>
        <table-wrap-foot>
          <fn id="table2fn1">
            <p><sup>a</sup>Not applicable.</p>
          </fn>
        </table-wrap-foot>
      </table-wrap>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <p>Our study demonstrated that retrieval-augmented GPT-4 achieved moderate performance in matching physician clinical trial recommendations in HN oncology. Comparatively, our retrieval-augmented LLM outperformed its pre–fine-tuned baseline and exceeded the historical performance of pretrained LLMs for providing oncology treatment recommendations by 4- to 20-fold (<italic>F</italic><sub>1</sub>-score 0.04-0.19) [<xref ref-type="bibr" rid="ref4">4</xref>]. Prior studies have evaluated LLM performance in matching patients to clinical trials, achieving high accuracy [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]; however, to our knowledge, our study is the first to evaluate an oncology-specific, retrieval-augmented LLM as a point-of-care, clinical trial decision support application. As our subgroup analyses demonstrated, LLM performance varies based on the specificity of the prompt and dataset, with enhanced precision achieved through reduced search ambiguity for biomarker-specific trials and cancer types with more well-defined datasets. Study limitations included small sample size, short-term assessment, cross-sectional design, disease-specific focus, and being conducted in a single institution, which limits generalizability and subgroup analyses; however, our study provides insights into the rarely measured performance of retrieval-augmented LLMs using real-world patient cases. Future research is needed to optimize LLMs’ precision and stability and to assess their implementation and effectiveness as a scalable solution for enhancing clinical trial participation.</p>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Preference pairs architecture.</p>
        <media xlink:href="jmir_v26i1e60695_app1.docx" xlink:title="DOCX File, 16 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">HN</term>
          <def>
            <p>head and neck</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">MSK</term>
          <def>
            <p>Memorial Sloan Kettering Cancer Center</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work is supported in part by the Memorial Sloan Kettering Cancer Center Support Grant (P30-CA008748) and the 2024 Conquer Cancer—Johnson &#38; Johnson Innovative Medicine Career Development Award (AWD00003905). The corresponding author has full access to all data in the study and takes responsibility for the integrity of the data and the accuracy of the data analysis. We thank all our patients, providers, and administrative staff who supported the study.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>TKWH is the founder of LookUpTrials by TeamX Health. ALH received compensation from or was a part of the advisory boards of Eisai, Exelixis, Novartis, Merck, Rgenta, Coherus, Kura Oncology, Remix Therapeutics, McGivney Global Advisors, Prelude Therapeutics, Affyimmune, Elevar Therapeutics, Ayala, Nested Therapeutics, and AstraZeneca. He was the principal investigator of clinical trials for Eisai, Bayer, Genentech, AstraZeneca, Novartis, Merck, BMS, Verastem, Remix Therapeutics, Rgenta Therapeutics, Kura Oncology, Ayala, TILT Therapeutics, Hookipa, Novartis, Daiichi Sankyo, and Astellas. ALH is a co-inventor of the patent “Lesional dosimetry methods for tailoring targeted radiotherapy in cancer” (Serial number 63/193700, filed 5/27/21) and serves on the Speaker Bureau of Physician Education Resources. The other authors declare no conflicts of interest.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <month>02</month>
          <day>09</day>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Longwell</surname>
              <given-names>JB</given-names>
            </name>
            <name name-style="western">
              <surname>Grant</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Hirsch</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Binder</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Jang</surname>
              <given-names>RW</given-names>
            </name>
            <name name-style="western">
              <surname>Krishnan</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>Large language models encode medical oncology knowledge: performance on the ASCO and ESMO examination questions</article-title>
          <source>JCO Oncology Practice</source>
          <year>2023</year>
          <month>11</month>
          <volume>19</volume>
          <issue>11_suppl</issue>
          <fpage>511</fpage>
          <lpage>511</lpage>
          <pub-id pub-id-type="doi">10.1200/op.2023.19.11_suppl.511</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kann</surname>
              <given-names>BH</given-names>
            </name>
            <name name-style="western">
              <surname>Foote</surname>
              <given-names>MB</given-names>
            </name>
            <name name-style="western">
              <surname>Aerts</surname>
              <given-names>HJWL</given-names>
            </name>
            <name name-style="western">
              <surname>Savova</surname>
              <given-names>GK</given-names>
            </name>
            <name name-style="western">
              <surname>Mak</surname>
              <given-names>RH</given-names>
            </name>
            <name name-style="western">
              <surname>Bitterman</surname>
              <given-names>DS</given-names>
            </name>
          </person-group>
          <article-title>Use of artificial intelligence chatbots for cancer treatment information</article-title>
          <source>JAMA Oncol</source>
          <year>2023</year>
          <month>10</month>
          <day>01</day>
          <volume>9</volume>
          <issue>10</issue>
          <fpage>1459</fpage>
          <lpage>1462</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37615976"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamaoncol.2023.2954</pub-id>
          <pub-id pub-id-type="medline">37615976</pub-id>
          <pub-id pub-id-type="pii">2808731</pub-id>
          <pub-id pub-id-type="pmcid">PMC10450584</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Benary</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>XD</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Soll</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hilfenhaus</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Nassir</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sigler</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Knödler</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Keller</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Beule</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Keilholz</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Leser</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Rieke</surname>
              <given-names>DT</given-names>
            </name>
          </person-group>
          <article-title>Leveraging large language models for decision support in personalized oncology</article-title>
          <source>JAMA Netw Open</source>
          <year>2023</year>
          <month>11</month>
          <day>01</day>
          <volume>6</volume>
          <issue>11</issue>
          <fpage>e2343689</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37976064"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.43689</pub-id>
          <pub-id pub-id-type="medline">37976064</pub-id>
          <pub-id pub-id-type="pii">2812097</pub-id>
          <pub-id pub-id-type="pmcid">PMC10656647</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hager</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Jungmann</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Holland</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bhagat</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hubrecht</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Knauer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Vielhauer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Makowski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Braren</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kaissis</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Rueckert</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Evaluation and mitigation of the limitations of large language models in clinical decision-making</article-title>
          <source>Nat Med</source>
          <year>2024</year>
          <month>07</month>
          <day>04</day>
          <fpage>1</fpage>
          <lpage>26</lpage>
          <pub-id pub-id-type="doi">10.1038/s41591-024-03097-1</pub-id>
          <pub-id pub-id-type="medline">38965432</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41591-024-03097-1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hung</surname>
              <given-names>KW</given-names>
            </name>
            <name name-style="western">
              <surname>Dunn</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sherman</surname>
              <given-names>EJ</given-names>
            </name>
            <name name-style="western">
              <surname>Ho</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Fetten</surname>
              <given-names>JV</given-names>
            </name>
            <name name-style="western">
              <surname>Michel</surname>
              <given-names>LS</given-names>
            </name>
            <name name-style="western">
              <surname>Kriplani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Baxi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Sanford</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Wong</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>NY</given-names>
            </name>
            <name name-style="western">
              <surname>Kuperman</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Mao</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Pfister</surname>
              <given-names>DG</given-names>
            </name>
          </person-group>
          <article-title>LookUpTrials: assessment of an artificial intelligence-powered mobile application to engage oncology providers in clinical trials</article-title>
          <source>JCO GO</source>
          <year>2023</year>
          <month>08</month>
          <volume>9</volume>
          <issue>Supplement_1</issue>
          <fpage>111</fpage>
          <lpage>111</lpage>
          <pub-id pub-id-type="doi">10.1200/go.2023.9.supplement_1.111</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rafailov</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mitchell</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Manning</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ermon</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Finn</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Direct preference optimization: your language model is secretly a reward model</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on July 29, 2024</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2305.18290</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jin</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Floudas</surname>
              <given-names>CS</given-names>
            </name>
          </person-group>
          <article-title>Matching patients to clinical trials with large language models</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on April 27, 2024</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2307.15051</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Unlu</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Shin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mailly</surname>
              <given-names>CJ</given-names>
            </name>
            <name name-style="western">
              <surname>Oates</surname>
              <given-names>MF</given-names>
            </name>
            <name name-style="western">
              <surname>Tucci</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Varugheese</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Wagholikar</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Scirica</surname>
              <given-names>BM</given-names>
            </name>
            <name name-style="western">
              <surname>Blood</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Aronson</surname>
              <given-names>SJ</given-names>
            </name>
          </person-group>
          <article-title>Retrieval-augmented generation–enabled GPT-4 for clinical trial screening</article-title>
          <source>NEJM AI</source>
          <year>2024</year>
          <month>06</month>
          <day>27</day>
          <volume>1</volume>
          <issue>7</issue>
          <pub-id pub-id-type="doi">10.1056/aioa2400181</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wornow</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lozano</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dash</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jindal</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mahaffey</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Zero-shot clinical trial patient matching with LLMs</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on April 10, 2024</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2402.05125</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
