<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v27i1e63626</article-id>
      <article-id pub-id-type="pmid">39908540</article-id>
      <article-id pub-id-type="doi">10.2196/63626</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Performance Evaluation of Large Language Models in Cervical Cancer Management Based on a Standardized Questionnaire: Comparative Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Coristine</surname>
            <given-names>Andrew</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Keshtkar</surname>
            <given-names>Kamyab</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mao</surname>
            <given-names>Siqi</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Kalluchi</surname>
            <given-names>Achyuth</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Kuerbanjiang</surname>
            <given-names>Warisijiang</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0008-2540-0613</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Peng</surname>
            <given-names>Shengzhe</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0002-7575-3608</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Jiamaliding</surname>
            <given-names>Yiershatijiang</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0004-9290-2091</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Yi</surname>
            <given-names>Yuexiong</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution/>
            <institution>Department of Gynecology</institution>
            <institution>Zhongnan Hospital of Wuhan University</institution>
            <addr-line>169 Donghu Road</addr-line>
            <addr-line>Wuhan, Hubei Province, 430071</addr-line>
            <country>China</country>
            <phone>86 15671669885</phone>
            <fax>86 027 67813142</fax>
            <email>yiyuexiong@163.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-5623-117X</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Gynecology</institution>
        <institution>Zhongnan Hospital of Wuhan University</institution>
        <addr-line>Wuhan, Hubei Province</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Yuexiong Yi <email>yiyuexiong@163.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>5</day>
        <month>2</month>
        <year>2025</year>
      </pub-date>
      <volume>27</volume>
      <elocation-id>e63626</elocation-id>
      <history>
        <date date-type="received">
          <day>25</day>
          <month>6</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>27</day>
          <month>9</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>1</day>
          <month>11</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>11</day>
          <month>12</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Warisijiang Kuerbanjiang, Shengzhe Peng, Yiershatijiang Jiamaliding, Yuexiong Yi. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 05.02.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2025/1/e63626" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Cervical cancer remains the fourth leading cause of death among women globally, with a particularly severe burden in low-resource settings. A comprehensive approach—from screening to diagnosis and treatment—is essential for effective prevention and management. Large language models (LLMs) have emerged as potential tools to support health care, though their specific role in cervical cancer management remains underexplored.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to systematically evaluate the performance and interpretability of LLMs in cervical cancer management.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Models were selected from the AlpacaEval leaderboard version 2.0 and based on the capabilities of our computer. The questions inputted into the models cover aspects of general knowledge, screening, diagnosis, and treatment, according to guidelines. The prompt was developed using the Context, Objective, Style, Tone, Audience, and Response (CO-STAR) framework. Responses were evaluated for accuracy, guideline compliance, clarity, and practicality, graded as A, B, C, and D with corresponding scores of 3, 2, 1, and 0. The effective rate was calculated as the ratio of A and B responses to the total number of designed questions. Local Interpretable Model-Agnostic Explanations (LIME) was used to explain and enhance physicians’ trust in model outputs within the medical context.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Nine models were included in this study, and a set of 100 standardized questions covering general information, screening, diagnosis, and treatment was designed based on international and national guidelines. Seven models (ChatGPT-4.0 Turbo, Claude 2, Gemini Pro, Mistral-7B-v0.2, Starling-LM-7B alpha, HuatuoGPT, and BioMedLM 2.7B) provided stable responses. Among all the models included, ChatGPT-4.0 Turbo ranked first with a mean score of 2.67 (95% CI 2.54-2.80; effective rate 94.00%) with a prompt and 2.52 (95% CI 2.37-2.67; effective rate 87.00%) without a prompt, outperforming the other 8 models (<italic>P</italic>&#60;.001). Regardless of prompts, QiZhenGPT consistently ranked among the lowest-performing models, with <italic>P</italic>&#60;.01 in comparisons against all models except BioMedLM. Interpretability analysis showed that prompts improved alignment with human annotations for proprietary models (median intersection over union 0.43), while medical-specialized models exhibited limited improvement.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Proprietary LLMs, particularly ChatGPT-4.0 Turbo and Claude 2, show promise in clinical decision-making involving logical analysis. The use of prompts can enhance the accuracy of some models in cervical cancer management to varying degrees. Medical-specialized models, such as HuatuoGPT and BioMedLM, did not perform as well as expected in this study. By contrast, proprietary models, particularly those augmented with prompts, demonstrated notable accuracy and interpretability in medical tasks, such as cervical cancer management. However, this study underscores the need for further research to explore the practical application of LLMs in medical practice.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>large language model</kwd>
        <kwd>cervical cancer</kwd>
        <kwd>screening</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>model interpretability</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Cervical cancer is a significant global public health challenge, ranking fourth among all female cancers and remaining the leading cause of death in many low-income countries [<xref ref-type="bibr" rid="ref1">1</xref>]. In 2020, approximately 604,127 new cases and 341,831 deaths from cervical cancer were reported worldwide [<xref ref-type="bibr" rid="ref1">1</xref>]. Effective cervical cancer control necessitates an integrated approach that combines screening, accurate diagnosis, and personalized treatment to reduce morbidity and mortality. Despite a substantial decline in cervical cancer incidence in the United States since the introduction of screening [<xref ref-type="bibr" rid="ref2">2</xref>], up to 25% of women remain inadequately treated [<xref ref-type="bibr" rid="ref3">3</xref>], with even higher rates observed in resource-limited and developing countries [<xref ref-type="bibr" rid="ref1">1</xref>]. Moreover, precise diagnosis and appropriate treatment are essential for addressing abnormalities detected through screening, particularly to prevent disease progression and improve survival outcomes [<xref ref-type="bibr" rid="ref4">4</xref>]. Hence, strengthening these efforts is essential for reducing the global burden of cervical cancer and improving patient outcomes across diverse health care contexts.</p>
      <p>Large language models (LLMs), as cutting-edge technologies in artificial intelligence, are trained on vast data sets and enable a wide range of applications, from text polishing to complex problem-solving, thanks to their unprecedented natural language understanding capabilities. In the health care domain, LLMs hold the potential to revolutionize medical practices, including decision-making, patient management, and clinical data interpretation [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. Notably, OpenAI’s proprietary LLMs, ChatGPT-3.5 and ChatGPT-4.0, have demonstrated high performance on the United States Medical Licensing Examination (USMLE), with ChatGPT-4.0 achieving particularly impressive results [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. Additionally, ChatGPT has shown competence across various medical fields, including surgery [<xref ref-type="bibr" rid="ref9">9</xref>], cardiology [<xref ref-type="bibr" rid="ref10">10</xref>], and plastic surgery [<xref ref-type="bibr" rid="ref11">11</xref>]. Compared with generic language models, medical-specialized models—fine-tuned on domain-specific data sets and subjected to specialized adjustments—have achieved equivalent or superior performance [<xref ref-type="bibr" rid="ref12">12</xref>].</p>
      <p>To date, only a limited number of studies [<xref ref-type="bibr" rid="ref13">13</xref>] have applied LLMs to questions related to cervical cancer or conducted explainability analyses on either closed- or open-source LLMs to assess their transparency and interpretability. The management of abnormal cervical cancer screening results, diagnosis, and treatment is a complex task that requires careful interpretation and follow-up [<xref ref-type="bibr" rid="ref14">14</xref>]. When deploying LLMs in cervical cancer management, it is crucial to evaluate their performance in managing abnormalities and to identify their strengths and limitations, particularly regarding model transparency and interpretability.</p>
      <p>In this study, we aim to compare the performance of current prevalent LLMs in cervical cancer management by evaluating their responses to a set of specifically designed questions (<xref rid="figure1" ref-type="fig">Figure 1</xref>). This research may provide valuable evidence to help clinicians manage screening results more effectively and accurately, particularly in regions with limited health care infrastructure.</p>
      <fig id="figure1" position="float">
        <label>Figure 1</label>
        <caption>
          <p>The flowchart of evaluation of LLMs’ performance in cervical cancer management. Nine LLMs, including closed-source, open-source, and medical-specialized types, selected from the AlpacaEval leaderboard were evaluated with 100 predefined questions derived from general inquiries and the ASCCP, CSCCP, FIGO, SEOM-GEICO, and CSCO guidelines. Responses were collected 3 times for each question, analyzed for semantic similarity, and reviewed by 2 experts for accuracy, effectiveness, and interpretability using LIME. ASCCP: American Society for Colposcopy and Cervical Pathology; CSCCP: Chinese Society for Colposcopy and Cervical Pathology of the China Healthy Birth Science Association; CSCO: Chinese Society of Clinical Oncology; FIGO: International Federation of Gynecology and Obstetrics; GEICO: Grupo Español de Investigación en Cáncer de Ovario; LIME: Local Interpretable Model-agnostic Explanations; LLM: large language model; SEOM: Sociedad Española de Oncología Médica.</p>
        </caption>
        <graphic xlink:href="jmir_v27i1e63626_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
      </fig>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Model Selection</title>
        <p>The AlpacaEval leaderboard is an automated system designed to evaluate language models based on their adherence to instructions, ranking them by comparing their responses to reference answers from top-performing models such as GPT-4. It aims to reduce biases, such as those related to output length. Unlike other leaderboards that may focus on a single type, this leaderboard includes both open- and closed-source models. The selection of potential models—whether closed-source, open-source, or medically specialized—is determined by their win rates on version 2.0 of the leaderboard, updated on March 3, 2024.</p>
        <p>For closed-source models, both free and paid versions are included, excluding those that are not publicly available or are in private beta. Open-source models are required to perform effectively on consumer-grade computers with standard configurations, given their potential use in resource-limited applications such as cervical cancer screening. The computer specifications for deploying these models are detailed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, with a maximum model size capacity of approximately 7 billion trainable parameters. The selection of medical-specialized models, which are limited in number on leaderboards, is informed by a study [<xref ref-type="bibr" rid="ref15">15</xref>] summarizing existing medical LLMs and their respective GitHub star counts. The performance of these medical LLMs is assessed based on the benchmark scores of their underlying models.</p>
      </sec>
      <sec>
        <title>Criteria for Question and Prompt Designing</title>
        <sec>
          <title>Questions Designing</title>
          <p>A comprehensive question set was developed to evaluate model performance, including general questions and those specifically focused on cervical cancer screening, diagnosis, and treatment. General questions were designed by our gynecological experts to address the most common queries about cervical cancer, covering essential, foundational information frequently encountered in clinical practice. Screening-related questions were crafted with reference to the Chinese Society for Colposcopy and Cervical Pathology of the China Healthy Birth Science Association (CSCCP) Consensus on cervical cancer screening and abnormal management in China [<xref ref-type="bibr" rid="ref16">16</xref>]. To ensure relevance and keep our questions up to date, we also incorporated the 2019 American Society for Colposcopy and Cervical Pathology (ASCCP) Risk-Based Management Consensus Guidelines for Abnormal Cervical Cancer Screening Tests and Cancer Precursors [<xref ref-type="bibr" rid="ref17">17</xref>]. The questions comprehensively address each clinical decision outlined in the CSCCP guideline flowcharts, as detailed in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. Additional screening questions were developed based on the Chinese Society of Clinical Oncology (CSCO) Guidelines for the Diagnosis and Treatment of Cervical Cancer (2023) [<xref ref-type="bibr" rid="ref18">18</xref>]. The diagnosis and treatment questions were developed with reference to the Sociedad Española de Oncología Médica-Grupo Español de Investigación en Cáncer de Ovario (SEOM-GEICO) Clinical Guidelines on Cervical Cancer (2023) [<xref ref-type="bibr" rid="ref19">19</xref>], the CSCO Guidelines for the Diagnosis and Treatment of Cervical Cancer [<xref ref-type="bibr" rid="ref18">18</xref>], and The International Federation of Gynecology and Obstetrics (FIGO) 2018 Gynecologic Cancer Report – Interpretation of the Cervical Cancer Guidelines [<xref ref-type="bibr" rid="ref20">20</xref>]. The design was guided by the principles outlined in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>.</p>
          <boxed-text id="box1" position="float">
            <title>Principles guiding design.</title>
            <p>
              <bold>1. Diverse complexity levels</bold>
            </p>
            <list list-type="bullet">
              <list-item>
                <p>A combination of basic and advanced questions was included to evaluate the model’s ability to address both routine and complex clinical scenarios.</p>
              </list-item>
            </list>
            <p>
              <bold>2. Strict guideline adherence</bold>
            </p>
            <list list-type="bullet">
              <list-item>
                <p>Questions were structured to prioritize guideline-based knowledge, minimizing reliance on outdated or nonevidence-based practices.</p>
              </list-item>
            </list>
            <p>
              <bold>3. Primarily closed-ended format</bold>
            </p>
            <list list-type="bullet">
              <list-item>
                <p>Predominantly closed-ended questions were used to reduce subjective bias, with a few open-ended questions included to assess the model’s capacity for divergent medical problem-solving.</p>
              </list-item>
            </list>
            <p>
              <bold>4. Definitive answers</bold>
            </p>
            <list list-type="bullet">
              <list-item>
                <p>Each question was designed to have a clear, definitive answer.</p>
              </list-item>
            </list>
          </boxed-text>
          <p>These questions aim to evaluate the models’ understanding of clinical guidelines, their decision-making processes, and their ability to provide clear, actionable advice.</p>
        </sec>
        <sec>
          <title>Prompt Designing</title>
          <p>The prompt was designed using the Context, Objective, Style, Tone, Audience, and Response (CO-STAR) framework, which was ranked as the top prompt in the inaugural GPT-4 Prompt Engineering Competition. This framework was applied to guide the LLM in generating expert-level responses in gynecology, with a clear focus on defining the context, objective, style, tone, audience, and response format.</p>
        </sec>
      </sec>
      <sec>
        <title>Questioning Method</title>
        <p>Each designed question was sequentially input 3 times for each model, both with and without the designed prompt, to test consistency. The coherence of the responses was evaluated using semantic textual similarity [<xref ref-type="bibr" rid="ref21">21</xref>] by ChatGPT-3.5, the top-performing model on AlpacaEval, which was not used as a test model. If the semantics of all 3 responses are identical, they are sent back to the originating model to select the most suitable answer. In cases of discrepancies, a pairwise comparison is performed with scores ranging from 0 (not typical at all) to 100 (extremely typical) [<xref ref-type="bibr" rid="ref22">22</xref>]. The 2 responses with the highest similarity scores are returned to their model, which then selects the most appropriate answer (<xref rid="figure2" ref-type="fig">Figure 2</xref>).</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>The flowchart of final response determination for LLMs. Each question was tested 3 times per model, with and without prompts, to assess consistency. Responses were analyzed using STS by ChatGPT-3.5 (not included as a test model). If all 3 responses were semantically identical, the model selected the most suitable answer. For discrepancies, a pairwise comparison scored responses from 0 (Not typical at all) to 100 (Extremely typical), and the 2 highest-scoring responses were returned to the model to determine the final response. LLM: large language model; STS: semantic textual similarity.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e63626_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Scoring Process and Criteria</title>
        <p>Two gynecological experts independently and anonymously reviewed the responses to each question. If both experts agreed on a score, it was directly accepted; otherwise, they discussed it to determine the final score. Responses were evaluated based on accuracy, adherence to clinical guidelines, clarity of communication, and practicality. A scoring system, modified from a previous study [<xref ref-type="bibr" rid="ref23">23</xref>], was used to categorize responses into 4 grades: A, B, C, and D, to minimize subjective bias. Grades A and B were considered effective, and the model’s effective rate was calculated as follows:</p>
        <p>Effective rate = (<italic>N<sub>A</sub></italic>+<italic>N<sub>B</sub></italic>)/(<italic>N<sub>A</sub></italic>+<italic>N<sub>B</sub></italic>+<italic>N<sub>C</sub></italic>+<italic>N<sub>D</sub></italic>) × 100%</p>
        <p>where <italic>N</italic> represents the number of responses assigned the grade indicated by its subscript. Grades A, B, C, and D are weighted at 3, 2, 1, and 0 points, respectively, for statistical analysis (<xref ref-type="table" rid="table1">Table 1</xref>).</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Criteria of scoring for response.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="600"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>Grade</td>
                <td>Description</td>
                <td>Scores</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>A</td>
                <td>Completely correct with comprehensive information</td>
                <td>3</td>
              </tr>
              <tr valign="top">
                <td>B</td>
                <td>Mostly correct, but with missing information or minor errors</td>
                <td>2</td>
              </tr>
              <tr valign="top">
                <td>C</td>
                <td>Contains major errors but with some correct content</td>
                <td>1</td>
              </tr>
              <tr valign="top">
                <td>D</td>
                <td>Completely wrong or off-topic</td>
                <td>0</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Model Explainability Analysis</title>
        <p>Local Interpretable Model-Agnostic Explanations (LIME) is widely recognized for generating locally interpretable explanations of machine learning model predictions, including natural language processing models [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. In this study, LIME was used to interpret LLM outputs by adapting methods previously successful in natural language processing. The primary LIME parameter, the number of samples, was set to 10 times the input sentence’s token count, based on preliminary experiments and prior applications of LIME to LLMs [<xref ref-type="bibr" rid="ref26">26</xref>]. Each input question was analyzed to identify key terms with assigned weights, and the top 5 key terms by weight were selected. Our experts manually annotated 5 key terms per question for comparison. An intersection-over-union (IoU) analysis was performed between the LIME-selected key terms and the expert-annotated key terms to evaluate their alignment (<xref rid="figure3" ref-type="fig">Figure 3</xref>).</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>The flowchart of model explainability analysis by the Local Interpretable Model-agnostic Explanations (LIME) methodology. LIME was used to interpret large language model (LLM) outputs by analyzing each input question and generating variant questions with or without a prompt. The number of samples was set to 10 times the token count of the input sentence. Key terms were extracted using the Bio_ClinicalBERT model, and the top 5 terms by weight were selected. Experts manually annotated 5 key terms per question for comparison. An intersection over union (IoU) analysis was conducted to evaluate the alignment between LIME-selected and expert-annotated key terms. BERT: Bidirectional Encoder Representations from Transformers.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e63626_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
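        <p>IoU(<italic>x</italic><sub>1</sub>, <italic>x</italic><sub>2</sub>) = (&#124;<italic>x</italic><sub>1</sub>∩<italic>x</italic><sub>2</sub>&#124;)/(&#124;<italic>x</italic><sub>1</sub>∪<italic>x</italic><sub>2</sub>&#124;), where <italic>x</italic><sub>1</sub> and <italic>x</italic><sub>2</sub> denote the sets of LIME-selected and expert-annotated key terms, respectively.</p>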
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study did not involve human participants, identifiable patient data, or protected health information. The data utilized in this study comprised publicly available sources, including leaderboards, clinical guidelines, and secondary analyses of model-generated outputs. Therefore, an ethical review was not required under Zhongnan Hospital of Wuhan University’s secondary research policies. The study complied with the Declaration of Helsinki and institutional guidelines for secondary data use.</p>
      </sec>
      <sec>
        <title>Statistical Methods</title>
        <p>Analyses were conducted using R version 4.3.1 (R Foundation) and RStudio 2023.12.1+402 (R Foundation). Differences across models were assessed using the chi-square test for categorical variables. For paired comparisons, data were first tested for normality. If normally distributed, a paired <italic>t</italic> test was applied, with results reported as mean and SD; otherwise, a Wilcoxon signed rank test was used, with outcomes presented as median and IQR. Effective rates were reported as mean values with 95% CIs. A <italic>P</italic> value of less than .05 was considered indicative of a significant difference.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Model Selection</title>
        <p>After screening for win rates and conducting tests on our computers, our study included 9 models. The proprietary models are ChatGPT-4.0 Turbo, Claude 2, and Gemini Pro, which are accessible through their official websites. The open-source LLMs include Mistral-7B-v0.2, Starling-LM-7B alpha, and Microsoft Phi-2. The medical-specialized models are the Chinese models HuatuoGPT and QiZhenGPT, along with the English model BioMedLM 2.7B. The expected performance ranking of the selected models is as follows: ChatGPT-4.0 Turbo &#62; Gemini Pro &#62; Claude 2 &#62; Mistral-7B-v0.2 &#62; Starling-LM-7B alpha &#62; ChatGLM 6B (QiZhenGPT) &#62; Phi-2 &#62; Baichuan2-7B-Chat (HuatuoGPT). BioMedLM 2.7B is excluded from this ranking because it is not listed on the AlpacaEval Leaderboard. The characteristics of the included models are presented in <xref ref-type="table" rid="table2">Table 2</xref>.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>The characteristics of included models.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="150"/>
            <col width="650"/>
            <thead>
              <tr valign="top">
                <td>Model and access reference</td>
                <td>AlpacaEval win rate</td>
                <td>Description</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>ChatGPT-4.0 Turbo [<xref ref-type="bibr" rid="ref27">27</xref>]</td>
                <td>50%</td>
                <td>Developed by OpenAI, ChatGPT-4.0 Turbo is an LLM<sup>a</sup> that is currently the most powerful in terms of performance.</td>
              </tr>
              <tr valign="top">
                <td>Claude 2 [<xref ref-type="bibr" rid="ref28">28</xref>]</td>
                <td>17.19%</td>
                <td>Developed by Anthropic, it is built on the GPT-3 architecture. This model features an ultra-long context window of 100,000 tokens, enabling it to handle longer context inputs efficiently.</td>
              </tr>
              <tr valign="top">
                <td>Mistral-7B-v0.2 [<xref ref-type="bibr" rid="ref29">29</xref>]</td>
                <td>14.72%</td>
                <td>Mistral-7B-v0.2 is the strongest open-source model on the list that can be deployed on consumer computers. Furthermore, the popularity of this model is high, as it received more than 700,000 downloads in January 2024.</td>
              </tr>
              <tr valign="top">
                <td>Starling-LM-7B Alpha [<xref ref-type="bibr" rid="ref30">30</xref>]</td>
                <td>14.25%</td>
                <td>A fine-tuned model that outperforms all models to date on MT-Bench except for OpenAI’s GPT-4 and GPT-4 Turbo.</td>
              </tr>
              <tr valign="top">
                <td>Gemini Pro [<xref ref-type="bibr" rid="ref31">31</xref>]</td>
                <td>18.18%</td>
                <td>Developed by Google DeepMind, the more advanced Gemini Ultra is not yet available to the public, so we used the Pro version.</td>
              </tr>
              <tr valign="top">
                <td>HuatuoGPT 2-7B [<xref ref-type="bibr" rid="ref32">32</xref>]</td>
                <td>1.99% (base model)</td>
                <td>Developed by the Shenzhen Institute of Big Data and The Chinese University of Hong Kong, this Chinese medical LLM is fine-tuned based on Baichuan2-7B. Uses deploying method. The online demo is available at [<xref ref-type="bibr" rid="ref33">33</xref>].</td>
              </tr>
              <tr valign="top">
                <td>QiZhenGPT [<xref ref-type="bibr" rid="ref34">34</xref>]</td>
                <td>3.01% (base model)</td>
                <td>Released by Zhejiang University, the project includes 3 versions, fine-tuned from the base models ChatGLM-6B, Chinese-LLaMA-7B, and CaMA-13B, respectively.</td>
              </tr>
              <tr valign="top">
                <td>Phi-2 [<xref ref-type="bibr" rid="ref35">35</xref>]</td>
                <td>2.34%</td>
                <td>Released by Microsoft, this small language model has only 2.7 billion parameters. It is easy to deploy, even on consumer-grade computers, where it exhibits exceptionally fast response times.</td>
              </tr>
              <tr valign="top">
                <td>BioMedLM 2.7B [<xref ref-type="bibr" rid="ref36">36</xref>]</td>
                <td>N/A<sup>b</sup></td>
                <td>Previously known as PubMedGPT 2.7B, this model was developed through pretraining.</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>LLM: large language model.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>N/A: not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Questions and Prompts for LLMs</title>
        <p>The question set consisted of 100 questions designed to encompass a broad range of clinical scenarios commonly encountered in cervical cancer management. The first 22 questions focused on general knowledge, emphasizing foundational aspects frequently encountered in clinical gynecology. The next 40 questions addressed cervical cancer screening, aligning with the latest consensus guidelines and decision-making protocols. Subsequently, 6 and 32 questions covered diagnosis and treatment, respectively, offering a comprehensive evaluation of the models’ ability to interpret diagnostic criteria and recommend evidence-based treatment options. By including both routine and complex queries, the question set serves as a robust benchmark for assessing model performance, accuracy, and adherence to evidence-based medical practices. The complete list of questions is provided in <xref ref-type="table" rid="table3">Table 3</xref>.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>The 100 designed questions based on cervical cancer guidelines.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="80"/>
            <col width="920"/>
            <thead>
              <tr valign="top">
                <td>Category</td>
                <td>Questions</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Questions related to general knowledge</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>What are the risk factors that may necessitate cervical cancer screening?</p>
                    </list-item>
                    <list-item>
                      <p>At what age or under what conditions is cervical cancer screening typically deemed unnecessary?</p>
                    </list-item>
                    <list-item>
                      <p>What strategies are effective in reducing the risk of developing cervical cancer?</p>
                    </list-item>
                    <list-item>
                      <p>What are the common clinical symptoms of cervical cancer?</p>
                    </list-item>
                    <list-item>
                      <p>For individuals who have been vaccinated against HPV<sup>a</sup>, is it still necessary for them to undergo cervical cancer screening?</p>
                    </list-item>
                    <list-item>
                      <p>Is cervical cancer screening still recommended for individuals who have had only 1 sexual partner or are not currently sexually active?</p>
                    </list-item>
                    <list-item>
                      <p>What are the recommended intervals for cervical cancer screening, and do these intervals vary among different age groups?</p>
                    </list-item>
                    <list-item>
                      <p>Is cervical cancer screening universally recommended for all age groups? If not, what are the reasons for excluding certain age groups from undergoing cervical cancer screening?</p>
                    </list-item>
                    <list-item>
                      <p>How necessary is cervical cancer screening for women who have undergone total hysterectomy?</p>
                    </list-item>
                    <list-item>
                      <p>What is the significance of cervical cancer screening?</p>
                    </list-item>
                    <list-item>
                      <p>Why is the combined use of cytological screening (Papanicolaou test) and HPV testing not recommended for women aged 21-29 years?</p>
                    </list-item>
                    <list-item>
                      <p>Is there an invariable link between HPV infection and the onset of cervical cancer?</p>
                    </list-item>
                    <list-item>
                      <p>How should one interpret an abnormal result from a cervical cancer screening test? Does such a result definitively indicate the presence of cervical cancer?</p>
                    </list-item>
                    <list-item>
                      <p>Why is yearly cervical cancer screening not recommended?</p>
                    </list-item>
                    <list-item>
                      <p>What are the objectives of cervical cancer screening protocols?</p>
                    </list-item>
                    <list-item>
                      <p>Is cervical cancer hereditary? If so, should individuals with a familial history of cervical cancer be subject to more frequent screening protocols?</p>
                    </list-item>
                    <list-item>
                      <p>What’s the difference between a pelvic examination and a Pap test?</p>
                    </list-item>
                    <list-item>
                      <p>Can individuals independently administer HPV tests, and if so, how accurate are these self-administered tests?</p>
                    </list-item>
                    <list-item>
                      <p>Is it possible for cervical cancer to manifest within the interscreening interval, particularly between 2 consecutive cervical screening tests?</p>
                    </list-item>
                    <list-item>
                      <p>Is cervical screening necessary for individuals who have reached menopause?</p>
                    </list-item>
                    <list-item>
                      <p>Is it recommended to undergo cervical screening during pregnancy?</p>
                    </list-item>
                    <list-item>
                      <p>What procedures are typically involved in cervical cancer screening?</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Questions related to screening</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>What is the examination process for HPV-positive high-risk types?</p>
                    </list-item>
                    <list-item>
                      <p>When the initial screening shows positive HPV with high-risk HPV types, emphasizing that HPV is not typed, what is the next step in the examination?</p>
                    </list-item>
                    <list-item>
                      <p>When the initial screening shows positive HPV with high-risk HPV types, and the cytological examination result is negative, what does it indicate? Should regular check-ups follow? And if so, what should be the frequency?</p>
                    </list-item>
                    <list-item>
                      <p>When the initial screening shows positive HPV with high-risk HPV types, and the cytological examination result is ≥ASC-US<sup>b</sup>, should the next step be a colposcopy?</p>
                    </list-item>
                    <list-item>
                      <p>When the initial screening shows positive HPV with high-risk HPV types, typing identifies HPV16/18 positivity, should the next step be a colposcopy?</p>
                    </list-item>
                    <list-item>
                      <p>When the initial screening shows positive HPV with high-risk HPV types, and upon typing, it shows neither HPV16/18 positive but 1 of the other 12 types, what is the most likely subsequent examination?</p>
                    </list-item>
                    <list-item>
                      <p>When cytological examination indicates ASC-US as an abnormal initial screening result, what is the next step in the examination?</p>
                    </list-item>
                    <list-item>
                      <p>When cytological examination indicates ASC-US as an abnormal initial screening result, and HPV is used for triage, if HPV is positive, should a colposcopy follow?</p>
                    </list-item>
                    <list-item>
                      <p>When cytological examination indicates ASC-US as an abnormal initial screening result, and HPV is used for triage, if HPV is negative, what should be the subsequent examination?</p>
                    </list-item>
                    <list-item>
                      <p>When cytological examination results show ASC-H<sup>c</sup>, LSIL<sup>d</sup>, HSIL<sup>e</sup>, is a colposcopy needed next?</p>
                    </list-item>
                    <list-item>
                      <p>When cytological examination results show AGC<sup>f</sup>, what is the next examination required?</p>
                    </list-item>
                    <list-item>
                      <p>If both cytological and high-risk HPV joint tests show negative results, what other examination should follow?</p>
                    </list-item>
                    <list-item>
                      <p>If joint testing of cytology and high-risk HPV shows HPV negative and the cytological result is ASC-US, what should be done next?</p>
                    </list-item>
                    <list-item>
                      <p>If joint testing of cytology and high-risk HPV shows HPV negative, and the cytological result is &#62;ASC-US, what should be done next? Is a colposcopy required?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer detection, if joint testing of cytology and high-risk HPV shows HPV positive and the cytological result is ≥ASC-US, what specific examinations should follow?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer detection, if joint testing of cytology and high-risk HPV shows HPV positive and the cytological result is negative, what specific examinations should follow?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer detection, if joint testing of cytology and high-risk HPV shows HPV positive and the cytological result is negative, and the HPV typing is HPV16/18 positive, what specific examinations should follow?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer detection, if a patient has joint testing of cytology and high-risk HPV, with the results showing HPV positivity, a negative cytological result, and no typing for HPV16 or HPV18, are additional tests needed?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a patient has a histopathological confirmation (biopsy) of LSIL, cytology findings of LSIL or higher, and a TZ3<sup>g</sup> classification, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a patient has a histopathological confirmation (biopsy) of LSIL, cytology findings of LSIL or higher, and a TZ1/2 classification, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a patient has a histopathological confirmation (biopsy) of LSIL and cytology findings of ASC-H or higher, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a patient aged 21-24 years has a histopathological confirmation (biopsy) of LSIL and cytology findings of ASC-H or higher, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a patient aged 21-24 years has a histopathological confirmation (biopsy) of LSIL, cytology findings of ASC-H or higher, and a TZ3 classification for colposcopy, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a patient aged 21-24 years has a histopathological confirmation (biopsy) of LSIL, cytology findings of ASC-H or higher, and a TZ1/2 classification for colposcopy, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a pregnant patient has a histopathological confirmation (biopsy) of LSIL and cytology findings of ASC-H or higher, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a patient has a histopathological confirmation (biopsy) of HSIL and a TZ1/2 classification, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a patient has a histopathological confirmation (biopsy) of HSIL and a TZ3 classification, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a patient aged 21-24 years has a histopathological confirmation (biopsy) of CIN<sup>h</sup> III/HSIL, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a patient aged 21-24 years has a histopathological confirmation (biopsy) of HSIL and a TZ3 classification for colposcopy, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a patient aged 21-24 years has a histopathological confirmation (biopsy) of CIN II/III/HSIL or CIN II/HSIL and a TZ1/2 classification for colposcopy, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a pregnant patient has a histopathological confirmation (biopsy) of HSIL, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>After diagnostic/therapeutic cervical conization for cervical cancer, what follow-up steps should be conducted?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer screening, if a pregnant patient has a histopathological confirmation (biopsy) of HSIL without invasive cancer during pregnancy, what follow-up steps should be conducted after childbearing?</p>
                    </list-item>
                    <list-item>
                      <p>What are the strategies for HPV vaccine use? What are the recommended vaccination programs for different age groups?</p>
                    </list-item>
                    <list-item>
                      <p>Is HPV primary screening applicable in low-income countries? If so, why?</p>
                    </list-item>
                    <list-item>
                      <p>Which cervical cancer screening methods are widely used in low-income countries, particularly in sub-Saharan Africa?</p>
                    </list-item>
                    <list-item>
                      <p>What are the key indicators included in the expert consensus for quality control management of HPV testing?</p>
                    </list-item>
                    <list-item>
                      <p>What is the difference in clinical management between ASC-US and ASC-H?</p>
                    </list-item>
                    <list-item>
                      <p>Is 4-quadrant sampling still necessary for patients with no abnormalities on colposcopy?</p>
                    </list-item>
                    <list-item>
                      <p>What are the differences in cervical cancer screening and management strategies for women during pregnancy?</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Questions related to diagnosis</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>What is the preferred clinical diagnosis of cervical cancer?</p>
                    </list-item>
                    <list-item>
                      <p>What tests should be conducted to make a pathologic diagnosis of cervical cancer?</p>
                    </list-item>
                    <list-item>
                      <p>What tumor markers can be tested for laboratory diagnosis of cervical cancer? What is the significance of each tumor marker detected?</p>
                    </list-item>
                    <list-item>
                      <p>In the diagnosis of cervical cancer, which imaging method should be preferred to evaluate cervical tumors? Which imaging method should be used to evaluate metastatic lesions?</p>
                    </list-item>
                    <list-item>
                      <p>What are the recommended diagnostic tools for patients with FIGO<sup>i</sup> stage IA1 cervical cancer? Is it necessary to consider lymphovascular infiltration?</p>
                    </list-item>
                    <list-item>
                      <p>How are imaging tools used to assess tumor size and lymph node status in FIGO staging? What specific imaging tools are recommended?</p>
                    </list-item>
                  </list>
                </td>
              </tr>
              <tr valign="top">
                <td>Questions related to treatments</td>
                <td>
                  <list list-type="bullet">
                    <list-item>
                      <p>In cervical cancer treatment, if the patient does not wish to preserve fertility, and the stage is IA1 without lymphovascular space invasion, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the patient does not wish to preserve fertility, and the stage is IA1 with lymphovascular space invasion, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the patient does not wish to preserve fertility, and the stage is IA2, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the patient does not wish to preserve fertility, and the stage is IB1, IIA1, or IIB2, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the patient does not wish to preserve fertility, and the stage is IB3 or IIA2, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the patient wishes to preserve fertility, and the stage is IA1 without lymphovascular space invasion, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the patient wishes to preserve fertility, and the stage is IA1 with lymphovascular space invasion or IA2, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the patient wishes to preserve fertility, and the stage is IB1, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the patient wishes to preserve fertility, and the stage is IB2, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the stage is IIB, IIIA, or IIIB, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the stage is IIIC1, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the stage is IIIC2, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the stage is IVA without lymph node enlargement, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the stage is IVA with lymph node enlargement, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>In cervical cancer treatment, if the stage is IVB, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>After radical surgery for early cervical cancer, if the abdominal aortic lymph nodes are negative but high-risk factors are present, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>After radical surgery for early cervical cancer, if the abdominal aortic lymph nodes are negative but intermediate-risk factors are present, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>After radical surgery for early cervical cancer, if the abdominal aortic lymph nodes are positive but there is no distant metastasis, what treatment measures should be taken?</p>
                    </list-item>
                    <list-item>
                      <p>What are the surgical treatment options for patients with FIGO stage IA1 cervical cancer? Are they suitable for patients with preserved fertility?</p>
                    </list-item>
                    <list-item>
                      <p>In patients with FIGO stage IB2 and IIA1, what factors determine the choice between surgery and radiotherapy? What are the differences in outcomes between the 2 modalities?</p>
                    </list-item>
                    <list-item>
                      <p>What is the recommended treatment of choice for FIGO stage IB3 cervical cancer?</p>
                    </list-item>
                    <list-item>
                      <p>What is the difference between the different types of radical hysterectomy? For which patients is it indicated?</p>
                    </list-item>
                    <list-item>
                      <p>What are the advantages of intensity-modulated radiation therapy in radiotherapy for cervical cancer?</p>
                    </list-item>
                    <list-item>
                      <p>What are the common sites of recurrence in cervical cancer? How is the risk of recurrence monitored?</p>
                    </list-item>
                    <list-item>
                      <p>What are the recommended treatment strategies for recurrent cervical cancer? Can surgery, radiotherapy, and chemotherapy be combined?</p>
                    </list-item>
                    <list-item>
                      <p>What are the treatment strategies for metastatic cervical cancer? Are there specific treatments at different metastatic sites?</p>
                    </list-item>
                    <list-item>
                      <p>What are the treatment strategies for patients with cervical cancer during pregnancy? How does the treatment differ in early, intermediate, and advanced stages of pregnancy?</p>
                    </list-item>
                    <list-item>
                      <p>Are patients with cervical cancer in pregnancy suitable for surgery? At what stage of pregnancy should surgery be considered?</p>
                    </list-item>
                    <list-item>
                      <p>What is the radiotherapy strategy for patients with locally advanced cervical cancer? When is a combination of chemotherapy recommended?</p>
                    </list-item>
                    <list-item>
                      <p>When is image-guided brachytherapy necessary and how is it different from conventional radiotherapy?</p>
                    </list-item>
                    <list-item>
                      <p>What are the methods for monitoring recurrence after treatment of cervical cancer? Is routine imaging recommended?</p>
                    </list-item>
                    <list-item>
                      <p>What are the main goals of palliative care? In patients with metastatic cervical cancer, how can palliative care be combined with radiotherapy and chemotherapy?</p>
                    </list-item>
                  </list>
                </td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>HPV: human papillomavirus.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>ASC-US: atypical squamous cells of undetermined significance.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>ASC-H: atypical squamous cells, cannot exclude high-grade squamous intraepithelial lesion.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>LSIL: low-grade squamous intraepithelial lesion.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>HSIL: high-grade squamous intraepithelial lesion.</p>
            </fn>
            <fn id="table3fn6">
              <p><sup>f</sup>AGC: atypical glandular cells.</p>
            </fn>
            <fn id="table3fn7">
              <p><sup>g</sup>TZ3: Type 3 transformation zone.</p>
            </fn>
            <fn id="table3fn8">
              <p><sup>h</sup>CIN: cervical intraepithelial neoplasia.</p>
            </fn>
            <fn id="table3fn9">
              <p><sup>i</sup>FIGO: International Federation of Gynecology and Obstetrics.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>Using the CO-STAR framework, the prompt was designed to guide the model in providing clinically relevant and detailed responses, meeting the standards necessary for accurate interpretation in cervical cancer management. The specific details of the prompt are presented in <xref ref-type="table" rid="table4">Table 4</xref>.</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Prompt design based on the CO-STAR<sup>a</sup> framework.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="130"/>
            <col width="870"/>
            <thead>
              <tr valign="top">
                <td>Prompt element</td>
                <td>Content</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td># Context #</td>
                <td>Now you are a gynecologist with over 20 years of experience in medicine and you are answering questions about the medical specialty of cervical cancer treatment, diagnosis, and screening.</td>
              </tr>
              <tr valign="top">
                <td># Objective #</td>
                <td>Please answer the following questions correctly and in strict accordance with the latest guidelines for the screening, treatment, and diagnosis of cervical cancer.</td>
              </tr>
              <tr valign="top">
                <td># Style #</td>
                <td>The information should be clear, concise, and medically accurate, using terminology appropriate for both health care professionals and patients.</td>
              </tr>
              <tr valign="top">
                <td># Tone #</td>
                <td>The tone should be formal and professional, recognizing the sensitive nature of cancer-related discussions.</td>
              </tr>
              <tr valign="top">
                <td># Audience #</td>
                <td>The primary audience includes health care professionals, researchers, and patients seeking information about cervical cancer management.</td>
              </tr>
              <tr valign="top">
                <td># Response #</td>
                <td>Generate detailed responses to specific queries regarding cervical cancer. Assess the accuracy and relevance of the information provided.</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>CO-STAR: Context, Objective, Style, Tone, Audience, and Response.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Model Stability</title>
        <p>Among the 9 models evaluated, 7 demonstrated good reproducibility with stable responses. However, the repeatability of Phi-2 and QiZhenGPT was unsatisfactory, as posing the same question 3 times often resulted in varying answers. For Phi-2, 61 out of 100 responses with the prompt and 68 responses without the prompt exhibited semantic differences across repetitions. Similarly, for QiZhenGPT, 60 responses with the prompt and 55 without the prompt varied. In both cases, pairwise comparisons were necessary to determine the final output (see <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p>
      </sec>
      <sec>
        <title>Model Efficacy</title>
        <p>The evaluation results for each model, with and without the prompt, are presented in <xref rid="figure4" ref-type="fig">Figure 4</xref>. The top 3 performers were all proprietary models. ChatGPT-4.0 Turbo achieved the highest effective rate, at 94% (mean score 2.67, 95% CI 2.54-2.80) with the prompt and 87% (mean score 2.52, 95% CI 2.37-2.67) without it, highlighting the positive impact of the prompt on its performance. Claude 2 maintained an effective rate of 85% both with and without the prompt, with similar mean scores of 2.35 (95% CI 2.16-2.54) and 2.39 (95% CI 2.22-2.56), respectively. Gemini Pro showed moderate improvement, with its effective rate increasing from 66% (mean score 2.00, 95% CI 1.80-2.20) without the prompt to 77% (mean score 2.25, 95% CI 2.06-2.44) with the prompt.</p>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Efficacy assessment of each model with and without the prompt. The numbers in A, B, C, and D represent the distribution of response quality in each grade. ChatGPT-4.0 Turbo achieved the highest effective rate (94% with the prompt, mean score 2.67; 87% without the prompt, mean score 2.52), while Claude 2 remained consistent at 85%. Gemini Pro improved from 66% without the prompt to 77% with the prompt. Among medically specialized models, the effective rate of HuatuoGPT was slightly higher without the prompt (57%) than with it (53%), BioMedLM stayed low (39% with the prompt vs 38% without), and QiZhenGPT had the lowest rates (33% with the prompt vs 32% without), showing minimal impact from the prompt.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e63626_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>By contrast, the 3 medically specialized models exhibited lower effective rates. HuatuoGPT achieved an effective rate of 53% (mean score 2.00, 95% CI 1.80-2.20) with the prompt, which unexpectedly increased to 57% (mean score 1.76, 95% CI 1.54-1.98) without it. BioMedLM showed minimal improvement, with an effective rate of 39% (mean score 1.13, 95% CI 0.90-1.36) with the prompt and 38% (mean score 1.76, 95% CI 1.54-1.98) without it. QiZhenGPT had the lowest performance, with an effective rate of 33% (mean score 1.13, 95% CI 0.91-1.35) with the prompt and 32% (mean score 1.19, 95% CI 0.97-1.41) without it, showing limited impact from the prompt on enhancing its responses. The STS testing results are provided in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. Detailed responses and original scoring are provided in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p>
        <p>The chi-square test revealed significant differences across models (<italic>P</italic>=.001). As the data for each model did not follow a normal distribution (<italic>P</italic>&#60;.01), the Wilcoxon rank sum test was applied. With the prompt, ChatGPT-4.0 Turbo and Claude 2 exhibited highly significant differences (<italic>P</italic>&#60;.001) compared with most other models, indicating substantial performance enhancement when the prompt was used. This pattern remained consistent in comparisons with lower-performing models, such as HuatuoGPT, BioMedLM, and QiZhenGPT. Without the prompt, significant differences were still observed, particularly between high-performing models such as ChatGPT-4.0 Turbo (<italic>P</italic>&#60;.001) and Claude 2 (<italic>P</italic>&#60;.001) and lower-performing models. However, the absence of the prompt reduced significance in certain comparisons, such as between Mistral-7B and Gemini Pro (<italic>P</italic>=.30) or BioMedLM and QiZhenGPT (<italic>P</italic>=.64). When comparing performance with and without the prompt, ChatGPT-4.0 Turbo and Gemini Pro demonstrated statistically significant improvements with the prompt (<italic>P</italic>&#60;.001), whereas Claude 2 showed no significant difference (<italic>P</italic>=.07). By contrast, models such as BioMedLM (<italic>P</italic>=.77), Phi-2 (<italic>P</italic>=.53), and QiZhenGPT (<italic>P</italic>=.01) exhibited minimal or insignificant changes (<xref rid="figure5" ref-type="fig">Figure 5</xref>).</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Pairwise significance comparison between models with and without the prompt. The upper triangle represents significance levels between models with the prompt, while the lower triangle displays significance levels without the prompt. The diagonal section shows significance of performance differences within each model between the prompted and unprompted conditions. The Wilcoxon rank sum test was performed as the data for each model did not follow a normal distribution. ChatGPT-4.0 Turbo and Claude 2 showed significant improvements (<italic>P</italic>&#60;.001) with prompts, outperforming HuatuoGPT, BioMedLM, and QiZhenGPT. Without prompts, differences persisted but were less pronounced, especially between models such as Mistral-7B and Gemini Pro. insig.: insignificant; *<italic>P</italic>&#60;.05; **<italic>P</italic>&#60;.01; ***<italic>P</italic>&#60;.001.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e63626_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Model Explainability</title>
        <p>Given the nonnormal distribution of IoU values for each model, the Wilcoxon rank sum test was used to assess differences. As shown in <xref rid="figure6" ref-type="fig">Figure 6</xref>, the inclusion of prompts significantly improved the alignment between model-generated explanations and human annotations, with all models exhibiting statistically significant differences between prompted and unprompted conditions (<italic>P</italic>&#60;.001). Specifically, Claude 2, Gemini Pro, Starling-LM-7B Alpha, ChatGPT-4.0 Turbo, and Mistral-7B-v0.2 demonstrated a consistent median IoU of 0.43 with prompts. Among these, ChatGPT-4.0 Turbo had the widest IoU range (IQR 0.56). Without prompts, the median IoU for these models dropped to 0.25, with narrower IQRs ranging from 0.32 to 0.43, indicating reduced interpretability consistency. Among the medically specialized models, QiZhenGPT showed the most substantial improvement with prompts, achieving a median IoU of 0.43 (IQR 0.42), aligning it with the performance of proprietary models under similar conditions. By contrast, BioMedLM 2.7B and HuatuoGPT exhibited lower interpretability, with median IoUs of 0.29 and 0.25, respectively, and smaller IQRs in nonprompted conditions (median IoU of 0.11 and IQR of 0.25 for both).</p>
        <fig id="figure6" position="float">
          <label>Figure 6</label>
          <caption>
            <p>Model explainability analysis by intersection over union (IoU) for included models with or without the prompt. Data are expressed as median and IQR, and the Wilcoxon rank sum test was applied due to nonnormal distribution within each model's data. Claude 2, Gemini Pro, Starling-LM-7B Alpha, ChatGPT-4.0 Turbo, and Mistral-7B-v0.2 achieved a median IoU of 0.43 with prompts, dropping to 0.25 without prompts, with ChatGPT-4.0 Turbo showing the widest range (IQR 0.56). QiZhenGPT showed the most improvement among medical models, with a median IoU of 0.43 with prompts. BioMedLM 2.7B and HuatuoGPT showed lower interpretability, with nonprompted median IoUs of 0.11 (IQR 0.25).</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e63626_fig6.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This study systematically evaluated 9 LLMs for their performance, stability, and interpretability in cervical cancer management. The results revealed that proprietary models, such as ChatGPT-4.0 Turbo, Claude 2, and Gemini Pro, achieved superior response accuracy and interpretability, particularly with prompt guidance. By contrast, medically specialized models such as HuatuoGPT, QiZhenGPT, and BioMedLM demonstrated comparatively lower effectiveness, with limited improvement from prompt use. Notably, while proprietary models exhibited consistent reproducibility, certain open-source and specialized models, such as Phi-2 and QiZhenGPT, showed variable responses upon repeated questioning. Furthermore, the use of prompts significantly enhanced interpretability in models such as Claude 2, Gemini Pro, and Starling-LM-7B Alpha, highlighting the potential of structured input to improve alignment with clinical expectations.</p>
      </sec>
      <sec>
        <title>Comparison to Prior Work</title>
        <p>In terms of average score ranking, proprietary models such as ChatGPT-4.0 Turbo, Claude 2, and Gemini Pro outperformed open-source models. This result aligns with traditional views on the superiority of proprietary systems [<xref ref-type="bibr" rid="ref37">37</xref>]. However, without the prompt, Mistral-7B outperformed Gemini Pro. Among the open-source models, Mistral-7B-v0.2 and Starling-LM-7B Alpha outperformed HuatuoGPT and BioMedLM 2.7B. However, the repeatability of answers from Microsoft Phi-2 was poor, making it unsuitable for medical applications, while ChatGPT-4.0 Turbo and Claude 2 provided accurate and consistent responses. Our results indicated that the performance of the 3 medical models was average, challenging the prevailing belief that medical-specific models are superior for medical queries [<xref ref-type="bibr" rid="ref38">38</xref>]. Previous studies [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref39">39</xref>] have shown that larger models, characterized by increased parameter counts, tend to perform better. Additionally, as the model scale increases, its generalization ability improves [<xref ref-type="bibr" rid="ref40">40</xref>]. This may explain the relative underperformance of medical models compared with proprietary models, given the substantial disparity in parameter magnitude between them.</p>
        <p>Recent advancements in algorithms have been shown to improve the performance of LLMs in the medical field [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref39">39</xref>], with research [<xref ref-type="bibr" rid="ref41">41</xref>] indicating significant accuracy improvements using specific prompts. The integration of prompts has had a notable impact on the performance of several LLMs, emphasizing the value of structured input in guiding model responses within clinical contexts. Proprietary models, such as ChatGPT-4.0 Turbo and Gemini Pro, showed marked improvements in effective rate and response accuracy when guided by the CO-STAR prompt framework, suggesting that structured prompts help enhance focus on relevant clinical information and reduce ambiguity [<xref ref-type="bibr" rid="ref42">42</xref>]. Conversely, models with specialized but limited training, such as BioMedLM, exhibited minimal sensitivity to prompts, likely due to architectural limitations in processing complex prompt structures [<xref ref-type="bibr" rid="ref43">43</xref>]. Interestingly, HuatuoGPT experienced a decline in performance with the addition of prompts. This unexpected outcome suggests that the structured prompt for HuatuoGPT may have interfered with its response generation by introducing constraints that conflicted with its training data or underlying language patterns, potentially limiting its ability to accurately interpret open-ended clinical scenarios [<xref ref-type="bibr" rid="ref44">44</xref>]. Additionally, smaller models often become confused when handling longer prompts [<xref ref-type="bibr" rid="ref45">45</xref>]. The variation in prompt effectiveness across models underscores that, while structured prompts generally improve response precision, their impact is influenced by the model design and data scope.</p>
        <p>The IoU serves as a robust indicator of alignment between model-generated explanations and human annotations, providing insights into the interpretability of LLMs in clinical contexts [<xref ref-type="bibr" rid="ref46">46</xref>]. A higher IoU reflects greater consistency with human-provided explanations, suggesting enhanced model transparency and reliability in decision-making support. Our results demonstrate that a higher IoU corresponds to better alignment between model-generated explanations and human annotations, indicating improved interpretability. Proprietary models, particularly ChatGPT-4.0 Turbo and Claude 2, performed well in aligning with human explanations when prompts were used, highlighting their potential for generating clinically relevant interpretations. Interestingly, the rankings for model explainability based on IoU scores do not directly correlate with those based on effective rates. This discrepancy likely arises because improvements in model performance do not necessarily enhance explainability [<xref ref-type="bibr" rid="ref47">47</xref>]. According to previous studies [<xref ref-type="bibr" rid="ref48">48</xref>], as models become more accurate, their alignment with human-annotated explanations does not necessarily improve. This misalignment suggests that the factors driving a model’s effectiveness in task accuracy differ from those contributing to explainability. Higher-performing models may rely on complex, implicit patterns that are not fully captured by metrics such as IoU, which primarily assess agreement with human logic rather than the model’s actual reasoning process [<xref ref-type="bibr" rid="ref49">49</xref>]. However, IoU alone may not fully capture explanation quality, as it can overlook aspects such as coherence and clinical relevance. Therefore, incorporating qualitative assessments alongside IoU could provide a more comprehensive measure of model explainability in clinical contexts.</p>
      </sec>
      <sec>
        <title>Ethical Issues</title>
        <p>LLMs have performed well in the cervical cancer question-and-answer task, but ethical considerations, such as transparency, data privacy, and algorithmic bias, remain [<xref ref-type="bibr" rid="ref50">50</xref>]. Tools such as LIME enhance transparency and simplify the explanation of AI decisions, with further progress expected [<xref ref-type="bibr" rid="ref51">51</xref>]. Deployments adhere to strict data laws to ensure ongoing improvements in privacy, and technological advancements are anticipated to further safeguard patient privacy [<xref ref-type="bibr" rid="ref52">52</xref>]. Bias issues are managed through explainable AI and methods such as training with multiple multi-institutional or population data sets, as well as using generative adversarial networks to obtain more representative data [<xref ref-type="bibr" rid="ref53">53</xref>]. While practical challenges remain in technology integration and staff training, LLMs are more easily adopted due to their application programming interfaces and their ability to act as personalized learning assistants, reducing the reliance on extensive medical staff training [<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref55">55</xref>].</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>Our study also has limitations: (1) Because of the limited capabilities of our computers, we were unable to test all existing LLMs. It is possible that there are models with better performance than ChatGPT-4.0 Turbo in handling abnormal cervical screening results. (2) Our study did not include augmented algorithms or corpora that have been shown to improve LLM performance in other studies, as not all patients or physicians are familiar with these tools. The lack of these enhancements may limit the ability of LLMs to demonstrate their full potential in answering medical questions. This absence could have restricted the models from showcasing their full capabilities in medical query resolution, potentially affecting the generalizability of our results in more advanced settings. (3) The study conducted assessments using controlled, structured questions, which may not fully reflect the models’ performance in dynamic, real-world clinical settings. This controlled environment may limit our ability to assess the adaptability of LLMs in unpredictable or complex patient interactions.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>This study highlights the pivotal role of LLMs, particularly proprietary ones such as ChatGPT-4.0 Turbo, in enhancing clinical decision-making in cervical cancer screening. ChatGPT-4.0 Turbo outperforms both open-source and medically specialized models in interpreting clinical guidelines and handling medical queries. Such findings are essential for improving the accuracy and efficiency of medical screenings and diagnoses, ultimately enhancing health care delivery and patient care. Further research is needed to assess the effectiveness of LLMs in medical applications, potentially leading to the development of models more tailored for medical practice and advancing overall health care.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Computer specifications for deploying models.</p>
        <media xlink:href="jmir_v27i1e63626_app1.docx" xlink:title="DOCX File , 15 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Cervical cancer screening and abnormal result management process by the CSCCP (Chinese Society for Colposcopy and Cervical Pathology of the China Healthy Birth Science Association).</p>
        <media xlink:href="jmir_v27i1e63626_app2.docx" xlink:title="DOCX File , 467 KB"/>
      </supplementary-material>
      <supplementary-material id="app3">
        <label>Multimedia Appendix 3</label>
        <p>Semantic textual similarity testing for Phi-2 and QiZhenGPT models.</p>
        <media xlink:href="jmir_v27i1e63626_app3.docx" xlink:title="DOCX File , 154 KB"/>
      </supplementary-material>
      <supplementary-material id="app4">
        <label>Multimedia Appendix 4</label>
        <p>Responses from each model and quality assessments.</p>
        <media xlink:href="jmir_v27i1e63626_app4.docx" xlink:title="DOCX File , 424 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">ASCCP</term>
          <def>
            <p>American Society for Colposcopy and Cervical Pathology</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">CO-STAR</term>
          <def>
            <p>Context, Objective, Style, Tone, Audience, and Response</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">CSCCP</term>
          <def>
            <p>Chinese Society for Colposcopy and Cervical Pathology of the China Healthy Birth Science Association</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">CSCO</term>
          <def>
            <p>Chinese Society of Clinical Oncology</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">FIGO</term>
          <def>
            <p>International Federation of Gynecology and Obstetrics</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">GEICO</term>
          <def>
            <p>Grupo Español de Investigación en Cáncer de Ovario</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">IoU</term>
          <def>
            <p>intersection over union</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">LIME</term>
          <def>
            <p>Local Interpretable Model-agnostic Explanations</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">SEOM</term>
          <def>
            <p>Sociedad Española de Oncología Médica</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">USMLE</term>
          <def>
            <p>United States Medical Licensing Examination</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the Science and Technology Innovation Cultivation Funding of Zhongnan Hospital of Wuhan University (grant CXPY2022049).</p>
    </ack>
    <notes>
      <title>Data Availability</title>
      <p>The 100 questions developed for model evaluation and all analyzed data in this study are included in the published manuscript and its multimedia appendices.</p>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sung</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ferlay</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Siegel</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Laversanne</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Soerjomataram</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Jemal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bray</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Global Cancer Statistics 2020: GLOBOCAN estimates of incidence and mortality worldwide for 36 cancers in 185 countries</article-title>
          <source>CA Cancer J Clin</source>
          <year>2021</year>
          <month>05</month>
          <volume>71</volume>
          <issue>3</issue>
          <fpage>209</fpage>
          <lpage>249</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://onlinelibrary.wiley.com/doi/10.3322/caac.21660"/>
          </comment>
          <pub-id pub-id-type="doi">10.3322/caac.21660</pub-id>
          <pub-id pub-id-type="medline">33538338</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cohen</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Wentzensen</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Castle</surname>
              <given-names>PE</given-names>
            </name>
            <name name-style="western">
              <surname>Schiffman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zuna</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Arend</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Clarke</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>Racial and ethnic disparities in cervical cancer incidence, survival, and mortality by histologic subtype</article-title>
          <source>J Clin Oncol</source>
          <year>2023</year>
          <month>02</month>
          <day>10</day>
          <volume>41</volume>
          <issue>5</issue>
          <fpage>1059</fpage>
          <lpage>1068</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36455190"/>
          </comment>
          <pub-id pub-id-type="doi">10.1200/JCO.22.01424</pub-id>
          <pub-id pub-id-type="medline">36455190</pub-id>
          <pub-id pub-id-type="pmcid">PMC9928618</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Siegel</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>KD</given-names>
            </name>
            <name name-style="western">
              <surname>Wagle</surname>
              <given-names>NS</given-names>
            </name>
            <name name-style="western">
              <surname>Jemal</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Cancer statistics, 2023</article-title>
          <source>CA Cancer J Clin</source>
          <year>2023</year>
          <month>01</month>
          <volume>73</volume>
          <issue>1</issue>
          <fpage>17</fpage>
          <lpage>48</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://onlinelibrary.wiley.com/doi/10.3322/caac.21763"/>
          </comment>
          <pub-id pub-id-type="doi">10.3322/caac.21763</pub-id>
          <pub-id pub-id-type="medline">36633525</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Burmeister</surname>
              <given-names>CA</given-names>
            </name>
            <name name-style="western">
              <surname>Khan</surname>
              <given-names>SF</given-names>
            </name>
            <name name-style="western">
              <surname>Schäfer</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Mbatani</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Adams</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Moodley</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Prince</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Cervical cancer therapies: current challenges and future perspectives</article-title>
          <source>Tumour Virus Res</source>
          <year>2022</year>
          <month>06</month>
          <volume>13</volume>
          <fpage>200238</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2666-6790(22)00004-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.tvr.2022.200238</pub-id>
          <pub-id pub-id-type="medline">35460940</pub-id>
          <pub-id pub-id-type="pii">S2666-6790(22)00004-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC9062473</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chavez</surname>
              <given-names>MR</given-names>
            </name>
            <name name-style="western">
              <surname>Butler</surname>
              <given-names>TS</given-names>
            </name>
            <name name-style="western">
              <surname>Rekawek</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Heo</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Kinzler</surname>
              <given-names>WL</given-names>
            </name>
          </person-group>
          <article-title>Chat Generative Pre-trained Transformer: why we should embrace this technology</article-title>
          <source>Am J Obstet Gynecol</source>
          <year>2023</year>
          <month>06</month>
          <volume>228</volume>
          <issue>6</issue>
          <fpage>706</fpage>
          <lpage>711</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ajog.2023.03.010</pub-id>
          <pub-id pub-id-type="medline">36924908</pub-id>
          <pub-id pub-id-type="pii">S0002-9378(23)00155-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JK</given-names>
            </name>
            <name name-style="western">
              <surname>Chua</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rickard</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lorenzo</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and large language model (LLM) chatbots: the current state of acceptability and a proposal for guidelines on utilization in academic medicine</article-title>
          <source>J Pediatr Urol</source>
          <year>2023</year>
          <month>06</month>
          <day>02</day>
          <fpage>598</fpage>
          <lpage>604</lpage>
          <pub-id pub-id-type="doi">10.1016/j.jpurol.2023.05.018</pub-id>
          <pub-id pub-id-type="medline">37328321</pub-id>
          <pub-id pub-id-type="pii">S1477-5131(23)00224-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>De Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <month>02</month>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Knoedler</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Alfertshofer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Knoedler</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hoch</surname>
              <given-names>CC</given-names>
            </name>
            <name name-style="western">
              <surname>Funk</surname>
              <given-names>PF</given-names>
            </name>
            <name name-style="western">
              <surname>Cotofana</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Maheta</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Frank</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Brébant</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Prantl</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lamby</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Pure wisdom or Potemkin villages? A comparison of ChatGPT 3.5 and ChatGPT 4 on USMLE Step 3 style questions: quantitative analysis</article-title>
          <source>JMIR Med Educ</source>
          <year>2024</year>
          <month>01</month>
          <day>05</day>
          <volume>10</volume>
          <fpage>e51148</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2024/1/e51148/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/51148</pub-id>
          <pub-id pub-id-type="medline">38180782</pub-id>
          <pub-id pub-id-type="pii">v10i1e51148</pub-id>
          <pub-id pub-id-type="pmcid">PMC10799278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Beaulieu-Jones</surname>
              <given-names>BR</given-names>
            </name>
            <name name-style="western">
              <surname>Berrigan</surname>
              <given-names>Margaret T</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>Sahaj</given-names>
            </name>
            <name name-style="western">
              <surname>Marwaha</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Lai</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Brat</surname>
              <given-names>GA</given-names>
            </name>
          </person-group>
          <article-title>Evaluating capabilities of large language models: performance of GPT-4 on surgical knowledge assessments</article-title>
          <source>Surgery</source>
          <year>2024</year>
          <month>04</month>
          <volume>175</volume>
          <issue>4</issue>
          <fpage>936</fpage>
          <lpage>942</lpage>
          <pub-id pub-id-type="doi">10.1016/j.surg.2023.12.014</pub-id>
          <pub-id pub-id-type="medline">38246839</pub-id>
          <pub-id pub-id-type="pii">S0039-6060(23)00954-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC10947829</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Skalidis</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Cagnina</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Luangphiphat</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Mahendiran</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Muller</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Abbe</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Fournier</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT takes on the European Exam in Core Cardiology: an artificial intelligence success story?</article-title>
          <source>Eur Heart J Digit Health</source>
          <year>2023</year>
          <month>05</month>
          <volume>4</volume>
          <issue>3</issue>
          <fpage>279</fpage>
          <lpage>281</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37265864"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/ehjdh/ztad029</pub-id>
          <pub-id pub-id-type="medline">37265864</pub-id>
          <pub-id pub-id-type="pii">ztad029</pub-id>
          <pub-id pub-id-type="pmcid">PMC10232281</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Seth</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Hunter-Smith</surname>
              <given-names>DJ</given-names>
            </name>
            <name name-style="western">
              <surname>Rozen</surname>
              <given-names>WM</given-names>
            </name>
            <name name-style="western">
              <surname>Ross</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Aesthetic surgery advice and counseling from artificial intelligence: a rhinoplasty consultation with ChatGPT</article-title>
          <source>Aesthetic Plast Surg</source>
          <year>2023</year>
          <month>10</month>
          <volume>47</volume>
          <issue>5</issue>
          <fpage>1985</fpage>
          <lpage>1993</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37095384"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s00266-023-03338-7</pub-id>
          <pub-id pub-id-type="medline">37095384</pub-id>
          <pub-id pub-id-type="pii">10.1007/s00266-023-03338-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC10581928</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Singhal</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Azizi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mahdavi</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>HW</given-names>
            </name>
            <name name-style="western">
              <surname>Scales</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Tanwani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Cole-Lewis</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Pfohl</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Payne</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Seneviratne</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gamble</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Babiker</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schärli</surname>
              <given-names>Nathanael</given-names>
            </name>
            <name name-style="western">
              <surname>Chowdhery</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mansfield</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Demner-Fushman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Agüera Y Arcas</surname>
              <given-names>Blaise</given-names>
            </name>
            <name name-style="western">
              <surname>Webster</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Corrado</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Matias</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gottweis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tomasev</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rajkomar</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Barral</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Semturs</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Karthikesalingam</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Natarajan</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Large language models encode clinical knowledge</article-title>
          <source>Nature</source>
          <year>2023</year>
          <month>08</month>
          <volume>620</volume>
          <issue>7972</issue>
          <fpage>172</fpage>
          <lpage>180</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37438534"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="medline">37438534</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41586-023-06291-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC10396962</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hermann</surname>
              <given-names>CE</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Boyd</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Growdon</surname>
              <given-names>WB</given-names>
            </name>
            <name name-style="western">
              <surname>Aviki</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Stasenko</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Let's chat about cervical cancer: assessing the accuracy of ChatGPT responses to cervical cancer questions</article-title>
          <source>Gynecol Oncol</source>
          <year>2023</year>
          <month>12</month>
          <volume>179</volume>
          <fpage>164</fpage>
          <lpage>168</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ygyno.2023.11.008</pub-id>
          <pub-id pub-id-type="medline">37988948</pub-id>
          <pub-id pub-id-type="pii">S0090-8258(23)01535-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Stonehocker</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Cervical cancer screening in pregnancy</article-title>
          <source>Obstet Gynecol Clin North Am</source>
          <year>2013</year>
          <month>06</month>
          <volume>40</volume>
          <issue>2</issue>
          <fpage>269</fpage>
          <lpage>282</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ogc.2013.03.005</pub-id>
          <pub-id pub-id-type="medline">23732031</pub-id>
          <pub-id pub-id-type="pii">S0889-8545(13)00030-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Gu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hua</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Mao</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>You</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Clifton</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Clifton</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>A survey of large language models in medicine: progress, application, and challenge</article-title>
          <source>arXiv</source>
          <fpage>e1</fpage>
          <comment>Preprint posted online on July 22, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2311.05112"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2311.05112</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Expert consensus interpretation on Chinese cervical cancer screening and abnormal management issues by CSCCP</article-title>
          <source>Journal of Practical Obstetrics and Gynecology</source>
          <year>2018</year>
          <month>02</month>
          <day>15</day>
          <volume>34</volume>
          <issue>2</issue>
          <fpage>101</fpage>
          <lpage>104</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://kns.cnki.net/kcms2/article/abstract?v=1i9K2Vab0pNUjTXHLCgyumvmZ3mYpDxl2Z99yqiGtJwWltZXFA8nlGpbBnyCcZ2mCl9WPNj73SUfOIRFPNBh0YOo_DMGEW5HYeD9lcyoiXXM-PJtQosMyGEwVV0gKQxAtKamSjmB9ZRnDsrF7DCOUsbny8tNCbSKyKzXbY3xDod8B_Vb5X2kOg1grMjcQ74890FytoOOqIY=&#38;uniplatform=NZKPT&#38;language=CHS"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Perkins</surname>
              <given-names>RB</given-names>
            </name>
            <name name-style="western">
              <surname>Guido</surname>
              <given-names>RS</given-names>
            </name>
            <name name-style="western">
              <surname>Castle</surname>
              <given-names>PE</given-names>
            </name>
            <name name-style="western">
              <surname>Chelmow</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Einstein</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Garcia</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Huh</surname>
              <given-names>WK</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Moscicki</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nayar</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Saraiya</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sawaya</surname>
              <given-names>GF</given-names>
            </name>
            <name name-style="western">
              <surname>Wentzensen</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Schiffman</surname>
              <given-names>M</given-names>
            </name>
            <collab>2019 ASCCP Risk-Based Management Consensus Guidelines Committee</collab>
          </person-group>
          <article-title>2019 ASCCP risk-based management consensus guidelines for abnormal cervical cancer screening tests and cancer precursors</article-title>
          <source>J Low Genit Tract Dis</source>
          <year>2020</year>
          <month>04</month>
          <volume>24</volume>
          <issue>2</issue>
          <fpage>102</fpage>
          <lpage>131</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32243307"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/LGT.0000000000000525</pub-id>
          <pub-id pub-id-type="medline">32243307</pub-id>
          <pub-id pub-id-type="pii">00128360-202004000-00003</pub-id>
          <pub-id pub-id-type="pmcid">PMC7147428</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lou</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Q</given-names>
            </name>
          </person-group>
          <source>Chinese Society of Clinical Oncology (CSCO) Guidelines for the Diagnosis and Treatment of Cervical Cancer (2023)</source>
          <year>2023</year>
          <month>08</month>
          <day>01</day>
          <publisher-loc>Beijing, China</publisher-loc>
          <publisher-name>People's Medical Publishing House</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Manso</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ramchandani-Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Romero</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Sánchez-Lorenzo</surname>
              <given-names>Luisa</given-names>
            </name>
            <name name-style="western">
              <surname>Bermejo-Pérez</surname>
              <given-names>María José</given-names>
            </name>
            <name name-style="western">
              <surname>Estévez-García</surname>
              <given-names>Purificación</given-names>
            </name>
            <name name-style="western">
              <surname>Fariña-Madrid</surname>
              <given-names>Lorena</given-names>
            </name>
            <name name-style="western">
              <surname>García García</surname>
              <given-names>Yolanda</given-names>
            </name>
            <name name-style="western">
              <surname>Gil-Martin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Quindós</surname>
              <given-names>María</given-names>
            </name>
          </person-group>
          <article-title>SEOM-GEICO clinical guidelines on cervical cancer (2023)</article-title>
          <source>Clin Transl Oncol</source>
          <year>2024</year>
          <month>11</month>
          <volume>26</volume>
          <issue>11</issue>
          <fpage>2771</fpage>
          <lpage>2782</lpage>
          <pub-id pub-id-type="doi">10.1007/s12094-024-03604-3</pub-id>
          <pub-id pub-id-type="medline">39215938</pub-id>
          <pub-id pub-id-type="pii">10.1007/s12094-024-03604-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC11466906</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Luo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>Z</given-names>
            </name>
          </person-group>
          <article-title>FIGO 2018 gynecologic cancer report-interpretation of the cervical cancer guidelines</article-title>
          <source>Zhongguo Shiyong Fuke Yu Chanke Zazhi</source>
          <year>2019</year>
          <month>02</month>
          <day>19</day>
          <volume>35</volume>
          <issue>1</issue>
          <fpage>95</fpage>
          <lpage>103</lpage>
          <pub-id pub-id-type="doi">10.19538/j.fk2019010123</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Luan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zhuang</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Scaling sentence embeddings with large language models</article-title>
          <source>arXiv</source>
          <fpage>1</fpage>
          <comment>Preprint posted online on July 31, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2307.16645"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2307.16645</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Le Mens</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Kovács</surname>
              <given-names>Balázs</given-names>
            </name>
            <name name-style="western">
              <surname>Hannan</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Pros</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Uncovering the semantics of concepts using GPT-4</article-title>
          <source>Proc Natl Acad Sci U S A</source>
          <year>2023</year>
          <month>12</month>
          <day>05</day>
          <volume>120</volume>
          <issue>49</issue>
          <fpage>e2309350120</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.pnas.org/doi/abs/10.1073/pnas.2309350120?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.1073/pnas.2309350120</pub-id>
          <pub-id pub-id-type="medline">38032930</pub-id>
          <pub-id pub-id-type="pmcid">PMC10710071</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lozano</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fleming</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Chiang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Shah</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Clinfo.ai: an open-source retrieval-augmented large language model system for answering medical questions using scientific literature</article-title>
          <source>Pac Symp Biocomput</source>
          <year>2024</year>
          <volume>29</volume>
          <fpage>8</fpage>
          <lpage>23</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://psb.stanford.edu/psb-online/proceedings/psb24/abstracts/2024_p8.html"/>
          </comment>
          <pub-id pub-id-type="medline">38160266</pub-id>
          <pub-id pub-id-type="pii">9789811286421_0002</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Barr</surname>
              <given-names>KN</given-names>
            </name>
            <name name-style="western">
              <surname>Blomberg</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Saraiva</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Papapetrou</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Evaluating local interpretable model-agnostic explanations on clinical machine learning classification models</article-title>
          <year>2020</year>
          <month>07</month>
          <day>28</day>
          <conf-name>2020 IEEE 33rd International Symposium on Computer-Based Medical Systems (CBMS)</conf-name>
          <conf-date>July 28-30, 2020</conf-date>
          <conf-loc>Rochester, MN, USA</conf-loc>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>IEEE</publisher-name>
          <pub-id pub-id-type="doi">10.1109/cbms49503.2020.00009</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Madsen</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Reddy</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chandar</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Post-hoc interpretability for neural NLP: a survey</article-title>
          <source>ACM Comput Surv</source>
          <year>2022</year>
          <month>12</month>
          <day>23</day>
          <volume>55</volume>
          <issue>8</issue>
          <fpage>1</fpage>
          <lpage>42</lpage>
          <pub-id pub-id-type="doi">10.1145/3546577</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mamidanna</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Jangam</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Gilpin</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Can large language models explain themselves? A study of LLM-generated self-explanations</article-title>
          <source>arXiv</source>
          <fpage>1</fpage>
          <comment>Preprint posted online on October 17, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2310.11207"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2310.11207</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="web">
          <source>OpenAI</source>
          <access-date>2025-01-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://chat.openai.com/">https://chat.openai.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="web">
          <source>Claude</source>
          <access-date>2025-01-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://claude.ai/">https://claude.ai/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="web">
          <source>Hugging Face</source>
          <access-date>2025-01-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2">https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="web">
          <source>Hugging Face</source>
          <access-date>2025-01-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha">https://huggingface.co/berkeley-nest/Starling-LM-7B-alpha</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="web">
          <source>Google Gemini</source>
          <access-date>2025-01-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://gemini.google.com/">https://gemini.google.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <source>GitHub</source>
          <access-date>2025-01-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/FreedomIntelligence/HuatuoGPT?tab=readme-ov-file">https://github.com/FreedomIntelligence/HuatuoGPT?tab=readme-ov-file</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="web">
          <source>HuatuoGPT</source>
          <access-date>2025-01-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.huatuogpt.cn/#/">https://www.huatuogpt.cn/#/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="web">
          <source>GitHub</source>
          <access-date>2025-01-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://github.com/CMKRG/QiZhenGPT?tab=readme-ov-file">https://github.com/CMKRG/QiZhenGPT?tab=readme-ov-file</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <source>Hugging Face</source>
          <access-date>2025-01-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/microsoft/phi-2">https://huggingface.co/microsoft/phi-2</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="web">
          <source>Hugging Face</source>
          <access-date>2025-01-10</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://huggingface.co/stanford-crfm/BioMedLM">https://huggingface.co/stanford-crfm/BioMedLM</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Jiao</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Ravaut</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Xiong</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Joty</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT's one-year anniversary: are open-source large language models catching up?</article-title>
          <source>arXiv</source>
          <fpage>1</fpage>
          <comment>Preprint posted online on January 15, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2311.16989"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2311.16989</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tsoutsanis</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Tsoutsanis</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Evaluation of large language model performance on the Multi-Specialty Recruitment Assessment (MSRA) exam</article-title>
          <source>Comput Biol Med</source>
          <year>2024</year>
          <month>01</month>
          <volume>168</volume>
          <fpage>107794</fpage>
          <pub-id pub-id-type="doi">10.1016/j.compbiomed.2023.107794</pub-id>
          <pub-id pub-id-type="medline">38043471</pub-id>
          <pub-id pub-id-type="pii">S0010-4825(23)01259-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Xiao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>One LLM is not enough: harnessing the power of ensemble learning for medical question answering</article-title>
          <source>medRxiv</source>
          <fpage>1</fpage>
          <comment>Preprint posted online on December 24, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38196648"/>
          </comment>
          <pub-id pub-id-type="doi">10.1101/2023.12.21.23300380</pub-id>
          <pub-id pub-id-type="medline">38196648</pub-id>
          <pub-id pub-id-type="pii">2023.12.21.23300380</pub-id>
          <pub-id pub-id-type="pmcid">PMC10775333</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kaplan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>McCandlish</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Henighan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Chess</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Child</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Radford</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Amodei</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Scaling laws for neural language models</article-title>
          <source>arXiv</source>
          <fpage>1</fpage>
          <comment>Preprint posted online on January 23, 2020</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2001.08361"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liévin</surname>
              <given-names>Valentin</given-names>
            </name>
            <name name-style="western">
              <surname>Hother</surname>
              <given-names>CE</given-names>
            </name>
            <name name-style="western">
              <surname>Motzfeldt</surname>
              <given-names>AG</given-names>
            </name>
            <name name-style="western">
              <surname>Winther</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Can large language models reason about medical questions?</article-title>
          <source>Patterns (N Y)</source>
          <year>2024</year>
          <month>03</month>
          <day>08</day>
          <volume>5</volume>
          <issue>3</issue>
          <fpage>100943</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2666-3899(24)00042-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.patter.2024.100943</pub-id>
          <pub-id pub-id-type="medline">38487804</pub-id>
          <pub-id pub-id-type="pii">S2666-3899(24)00042-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC10935498</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sivarajkumar</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kelley</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Samolyk-Mazzanti</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Visweswaran</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>An empirical evaluation of prompting strategies for large language models in zero-shot clinical natural language processing: algorithm development and validation study</article-title>
          <source>JMIR Med Inform</source>
          <year>2024</year>
          <month>04</month>
          <day>08</day>
          <volume>12</volume>
          <fpage>e55318</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2024/1/e55318/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/55318</pub-id>
          <pub-id pub-id-type="medline">38587879</pub-id>
          <pub-id pub-id-type="pii">v12i1e55318</pub-id>
          <pub-id pub-id-type="pmcid">PMC11036183</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Langrené</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Zhu</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Unleashing the potential of prompt engineering in large language models: a comprehensive review</article-title>
          <source>arXiv</source>
          <fpage>1</fpage>
          <comment>Preprint posted online on September 5, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2310.14735"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2310.14735</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Bounding the capabilities of large language models in open text generation with prompt constraints</article-title>
          <source>arXiv</source>
          <fpage>1</fpage>
          <comment>Preprint posted online on February 17, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2302.09185"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2023.findings-eacl.148</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lester</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Rfou</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Constant</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>The power of scale for parameter-efficient prompt tuning</article-title>
          <source>arXiv</source>
          <fpage>1</fpage>
          <comment>Preprint posted online on September 2, 2021</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2104.08691"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2021.emnlp-main.243</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alvarez-Melis</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Kaur</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wallach</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Vaughan</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>From human explanation to model interpretability: a framework based on weight of evidence</article-title>
          <source>arXiv</source>
          <fpage>1</fpage>
          <comment>Preprint posted online on September 20, 2021</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2104.13299"/>
          </comment>
          <pub-id pub-id-type="doi">10.1609/hcomp.v9i1.18938</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lazebnik</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Bunimovich-Mendrazitsky</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rosenfeld</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>An algorithm to optimize explainability using feature ensembles</article-title>
          <source>Appl Intell</source>
          <year>2024</year>
          <month>02</month>
          <day>01</day>
          <volume>54</volume>
          <issue>2</issue>
          <fpage>2248</fpage>
          <lpage>2260</lpage>
          <pub-id pub-id-type="doi">10.1007/s10489-023-05069-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Heyen</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Widdicombe</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Siegel</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Perez-Ortiz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Treleaven</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>The effect of model size on LLM post-hoc explainability via LIME</article-title>
          <source>arXiv</source>
          <fpage>1</fpage>
          <comment>Preprint posted online on May 8, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2405.05348"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chakraborty</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tomsett</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Raghavendra</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Harborne</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Alzantot</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cerutti</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Srivastava</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Preece</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Julier</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kelley</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Braines</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sensoy</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Willis</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gurram</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Interpretability of deep learning models: a survey of results</article-title>
          <year>2018</year>
          <month>06</month>
          <day>28</day>
          <conf-name>2017 IEEE SmartWorld, Ubiquitous Intelligence &#38; Computing, Advanced &#38; Trusted Computed, Scalable Computing &#38; Communications, Cloud &#38; Big Data Computing, Internet of People and Smart City Innovation (SmartWorld/SCALCOM/UIC/ATC/CBDCom/IOP/SCI)</conf-name>
          <conf-date>August 4-8, 2017</conf-date>
          <conf-loc>San Francisco, CA, USA</conf-loc>
          <pub-id pub-id-type="doi">10.1109/uic-atc.2017.8397411</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhui</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Fenghe</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Xuehu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Qining</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Wei</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Ethical considerations and fundamental principles of large language models in medical education: viewpoint</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <month>08</month>
          <day>01</day>
          <volume>26</volume>
          <fpage>e60083</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024/1/e60083/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/60083</pub-id>
          <pub-id pub-id-type="medline">38971715</pub-id>
          <pub-id pub-id-type="pii">v26i1e60083</pub-id>
          <pub-id pub-id-type="pmcid">PMC11327620</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dong</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Lin</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Rajbahadur</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hassan</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>PromptExp: multi-granularity prompt explanation of large language models</article-title>
          <source>arXiv</source>
          <fpage>1</fpage>
          <comment>Preprint posted online on October 30, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2410.13073"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhan</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Seymour</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Such</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Beyond individual concerns: multi-user privacy in large language models</article-title>
          <year>2024</year>
          <month>07</month>
          <day>08</day>
          <conf-name>CUI '24: Proceedings of the 6th ACM Conference on Conversational User Interfaces</conf-name>
          <conf-date>July 8-10, 2024</conf-date>
          <conf-loc>Luxembourg City, Luxembourg</conf-loc>
          <pub-id pub-id-type="doi">10.1145/3640794.3665883</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ong</surname>
              <given-names>JCL</given-names>
            </name>
            <name name-style="western">
              <surname>Seng</surname>
              <given-names>BJJ</given-names>
            </name>
            <name name-style="western">
              <surname>Law</surname>
              <given-names>JZF</given-names>
            </name>
            <name name-style="western">
              <surname>Low</surname>
              <given-names>LL</given-names>
            </name>
            <name name-style="western">
              <surname>Kwa</surname>
              <given-names>ALH</given-names>
            </name>
            <name name-style="western">
              <surname>Giacomini</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Ting</surname>
              <given-names>DSW</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence, ChatGPT, and other large language models for social determinants of health: current state and future directions</article-title>
          <source>Cell Rep Med</source>
          <year>2024</year>
          <month>01</month>
          <day>16</day>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>101356</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2666-3791(23)00573-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.xcrm.2023.101356</pub-id>
          <pub-id pub-id-type="medline">38232690</pub-id>
          <pub-id pub-id-type="pii">S2666-3791(23)00573-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC10829781</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hart</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Hoffman</surname>
              <given-names>NG</given-names>
            </name>
            <name name-style="western">
              <surname>Gershkovich</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Christenson</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>McClintock</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>LJ</given-names>
            </name>
            <name name-style="western">
              <surname>Jackups</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Azimi</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Spies</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Brodsky</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Organizational preparedness for the use of large language models in pathology informatics</article-title>
          <source>J Pathol Inform</source>
          <year>2023</year>
          <volume>14</volume>
          <fpage>100338</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2153-3539(23)00152-9"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jpi.2023.100338</pub-id>
          <pub-id pub-id-type="medline">37860713</pub-id>
          <pub-id pub-id-type="pii">S2153-3539(23)00152-9</pub-id>
          <pub-id pub-id-type="pmcid">PMC10582733</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Su</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Qiao</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Dai</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Based on medicine, the now and future of large language models</article-title>
          <source>Cell Mol Bioeng</source>
          <year>2024</year>
          <month>08</month>
          <day>16</day>
          <volume>17</volume>
          <issue>4</issue>
          <fpage>263</fpage>
          <lpage>277</lpage>
          <pub-id pub-id-type="doi">10.1007/s12195-024-00820-3</pub-id>
          <pub-id pub-id-type="medline">39372551</pub-id>
          <pub-id pub-id-type="pii">820</pub-id>
          <pub-id pub-id-type="pmcid">PMC11450117</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
