<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e64348</article-id><article-id pub-id-type="doi">10.2196/64348</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Token Probabilities to Mitigate Large Language Models Overconfidence in Answering Medical Questions: Quantitative Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Bentegeac</surname><given-names>Rapha&#x00EB;l</given-names></name><degrees>MS, MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Le Guellec</surname><given-names>Bastien</given-names></name><degrees>MS, MD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Kuchcinski</surname><given-names>Gr&#x00E9;gory</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Amouyel</surname><given-names>Philippe</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Hamroun</surname><given-names>Aghiles</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff5">5</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Public Health, Lille University, Lille University Hospital Center</institution><addr-line>avenue du Professeur Emile Laine</addr-line><addr-line>Lille</addr-line><country>France</country></aff><aff id="aff2"><institution>Univ. Lille, Inserm, Centre Hosp. Univ Lille, Institut Pasteur de Lille, UMR1167&#x2014;Labex DISTALZ&#x2014;RID-AGE&#x2014;Risk Factors and Molecular Determinants of Aging-Related Diseases</institution><addr-line>Lille</addr-line><country>France</country></aff><aff id="aff3"><institution>Department of Neuroradiology, Lille University, Lille University Hospital Center</institution><addr-line>Lille</addr-line><country>France</country></aff><aff id="aff4"><institution>Univ. Lille, Inserm, CHU Lille, U1172&#x2014;LilNCog&#x2014;Lille Neuroscience &#x0026; Cognition</institution><addr-line>Lille</addr-line><country>France</country></aff><aff id="aff5"><institution>Univ. 
Lille, CNRS, Inserm, CHU Lille, Institut Pasteur de Lille, US 41 - UAR 2014 - PLBS</institution><addr-line>Lille</addr-line><country>France</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Xie</surname><given-names>Feng</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Ozek</surname><given-names>Burcu</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Chenxu</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Choi</surname><given-names>Donghee</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Jiang</surname><given-names>Yixing</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Aghiles Hamroun, MD, PhD, Department of Public Health, Lille University, Lille University Hospital Center, avenue du Professeur Emile Laine, Lille, 59037, France; <email>aghiles.hamroun@univ-lille.fr</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>29</day><month>8</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e64348</elocation-id><history><date date-type="received"><day>15</day><month>07</month><year>2024</year></date><date date-type="rev-recd"><day>09</day><month>05</month><year>2025</year></date><date date-type="accepted"><day>01</day><month>07</month><year>2025</year></date></history><copyright-statement>&#x00A9; Rapha&#x00EB;l Bentegeac, Bastien Le Guellec, Gr&#x00E9;gory Kuchcinski, Philippe Amouyel, Aghiles Hamroun. 
Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 29.8.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e64348"/><abstract><sec><title>Background</title><p>Chatbots have demonstrated promising capabilities in medicine, scoring passing grades for board examinations across various specialties. 
However, their tendency to express high levels of confidence in their responses, even when incorrect, poses a limitation to their utility in clinical settings.</p></sec><sec><title>Objective</title><p>The aim of the study is to examine whether token probabilities outperform chatbots&#x2019; expressed confidence levels in predicting the accuracy of their responses to medical questions.</p></sec><sec sec-type="methods"><title>Methods</title><p>In total, 9 large language models, comprising both commercial (GPT-3.5, GPT-4, and GPT-4o) and open source (Llama 3.1-8b, Llama 3.1-70b, Phi-3-Mini, Phi-3-Medium, Gemma 2-9b, and Gemma 2-27b), were prompted to respond to a set of 2522 questions from the United States Medical Licensing Examination (MedQA database). Additionally, the models rated their confidence from 0 to 100, and the token probability of each response was extracted. The models&#x2019; success rates were measured, and the predictive performances of both expressed confidence and response token probability in predicting response accuracy were evaluated using area under the receiver operating characteristic curves (AUROCs), adapted calibration error, and Brier score. Sensitivity analyses were conducted using additional questions sourced from other databases in English (MedMCQA: n=2797), Chinese (MedQA Mainland China: n=3413 and Taiwan: n=2808), and French (FrMedMCQA: n=1079), different prompting strategies, and temperature settings.</p></sec><sec sec-type="results"><title>Results</title><p>Overall, mean accuracy ranged from 56.5% (95% CI 54.6&#x2010;58.5) for Phi-3-Mini to 89% (95% CI 87.7&#x2010;90.2) for GPT-4o. Across the United States Medical Licensing Examination questions, all chatbots consistently expressed high levels of confidence in their responses (ranging from 90, 95% CI 90-90 for Llama 3.1-70b to 100, 95% CI 100-100 for GPT-3.5). 
However, expressed confidence failed to predict response accuracy (AUROC ranging from 0.52, 95% CI 0.50&#x2010;0.53 for Phi-3-Mini to 0.68, 95% CI 0.65&#x2010;0.71 for GPT-4o). In contrast, the response token probability consistently outperformed expressed confidence for predicting response accuracy (AUROCs ranging from 0.71, 95% CI 0.69&#x2010;0.73 for Phi-3-Mini to 0.87, 95% CI 0.85&#x2010;0.89 for GPT-4o; all <italic>P</italic>&#x003C;.001). Furthermore, all models demonstrated imperfect calibration, with a general trend toward overconfidence. These findings were consistent in sensitivity analyses.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Due to the limited capacity of chatbots to accurately evaluate their confidence when responding to medical queries, clinicians and patients should abstain from relying on their self-rated certainty. Instead, token probabilities emerge as a promising and easily accessible alternative for gauging the inner doubts of these models.</p></sec></abstract><kwd-group><kwd>ChatGPT</kwd><kwd>large language model</kwd><kwd>chatbot</kwd><kwd>confidence</kwd><kwd>token probability</kwd><kwd>natural language processing</kwd><kwd>NLP</kwd><kwd>machine learning</kwd><kwd>artificial intelligence</kwd><kwd>language model</kwd><kwd>token</kwd><kwd>probability</kwd><kwd>medical question</kwd><kwd>questionnaire</kwd><kwd>medicine</kwd><kwd>MedQA</kwd><kwd>accuracy</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The potential of large language models (LLMs) for enhancing clinical workflows and facilitating communication with patients is becoming increasingly evident [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. 
Since the public release of OpenAI&#x2019;s ChatGPT in 2023, use cases in medicine have flourished, from extracting information from large volumes of documents [<xref ref-type="bibr" rid="ref4">4</xref>] to answering questions of patients [<xref ref-type="bibr" rid="ref5">5</xref>]. While some models are getting bigger and more capable (OpenAI&#x2019;s o1, Google&#x2019;s Gemini 2.0, and Anthropic&#x2019;s Claude), others are focusing on data privacy and portability (Mistral Small, Meta Llama, and Microsoft Phi Mini). Their intuitive use combined with their ability to score passing grades on several board examinations including the United States Medical Licensing Examination (USMLE) is building trust in their safe deployment as medical assistants for both physicians and patients [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref10">10</xref>]. However, relying extensively on chatbots for health-related guidance carries the risk of misinformation and subsequent health hazards [<xref ref-type="bibr" rid="ref11">11</xref>], as they may generate inaccurate information, hallucinate, and lack robustness [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. Therefore, their use in high-stakes settings such as answering patients&#x2019; inquiries requires a careful evaluation of both their knowledge and their ability to clearly express uncertainty [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>Yet, research indicates that these models systematically express high levels of confidence in their answers, irrespective of their actual correctness [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. 
This inability to communicate hesitations is potentially misleading for both health care professionals and patients [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. Currently, developing a strategy to detect inner doubts of LLMs remains a major challenge for their safe use [<xref ref-type="bibr" rid="ref12">12</xref>], as noted directly on the ChatGPT website (&#x201C;ChatGPT can make mistakes. Check important info.&#x201D;). However, there currently exists no straightforward solution to detect such potential mistakes.</p><p>Interestingly, LLMs operate on a statistical model, in which each word (or token) is associated with a probability [<xref ref-type="bibr" rid="ref10">10</xref>], which represents the model&#x2019;s inner confidence in its output. Leveraging this metric in medicine could help users to identify responses that may need expert review [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. Despite the potential significance of token probabilities in approximating confidence levels, research in this area remains limited [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. Moreover, studies published so far have used laborious strategies, such as analyzing responses with other LLMs or answering the same question dozens of times to produce a single confidence estimate, a resource- and time-intensive strategy impractical in a clinical setting [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>].</p><p>Therefore, the primary objective of this study is to evaluate and compare the predictive abilities of expressed confidence and token probabilities in the most recent LLMs&#x2014;both commercial and open source&#x2014;across various multilingual datasets of medical licensing examination question-answer (Q-A). 
We seek to establish whether token probabilities offer a more accurate method for identifying inner doubts of LLMs, ultimately enhancing the reliability of chatbot-assisted decision-making in health care settings.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>This study is reported in accordance with the STROBE (Strengthening the Reporting of Observational Studies in Epidemiology) statement.</p><sec id="s2-1"><title>Models and Prompting</title><p>Models were selected to represent a diverse set of the top-performing LLMs on Measuring Massive Multitask Language Understanding in their respective size categories at the time of the study (May 2024). We selected a mix of commercial and open-source models to reflect the range of options available for practical applications. Additionally, we prioritized models with application programming interface (API) access or open weights to facilitate reproducibility. All selected models had to provide access to log probabilities in their API. As such, the Claude and Gemini models could not be used at the time of the study. Selected models were GPT-3.5, GPT-4, and GPT-4o (OpenAI), Llama 3.1-8b and Llama 3.1-70b (Meta), Phi-3-Mini and Phi-3-Medium (Microsoft), and Gemma 2-9b and Gemma 2-27b (Google). Detailed information on all the selected models is available in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. OpenAI models were accessed using the OpenAI API. All other models were accessed using Microsoft Azure&#x2019;s API. We used vanilla prompting (ie, no prompt engineering), as in previous studies [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref26">26</xref>]. The vanilla prompt for all datasets and models was as follows: &#x201C;Select the letter corresponding to the correct answer. In addition, please rate your confidence in your answer from 0 (not confident) to 100 (absolutely confident). 
Your output must follow this template, with no additional comment: &#x2018;The correct answer is: [letter]. My confidence level is: [number].&#x2019;&#x201D; Temperature was set to 0 to mitigate variability [<xref ref-type="bibr" rid="ref22">22</xref>], and a sensitivity analysis with a temperature of 0.5 was performed. Additional sensitivity analyses were performed using different prompting strategies: &#x201C;expert instruction prompt&#x201D; consisting of informing the model that it is a medical expert (&#x201C;You are a medical expert&#x201D; followed by the vanilla prompt), &#x201C;confidence scaling&#x201D; consisting of adjusting the requested expressed confidence on a scale from 0 to 1, and &#x201C;few-shot prompting,&#x201D; consisting of providing 3 examples of correct responses before the query.</p><p>Between May 29 and June 10, 2024, for the primary analysis and between December 13 and 23, 2024, for the sensitivity analyses, 9 LLMs were prompted to answer medical multiple-choice questions from datasets of medical licensing examinations in 3 languages (English, Chinese, and French; Figure S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). These Q-A databases include validated questions and answers for medical licensing examinations in the United States (USMLE MedQA, with step 1 [basic sciences and mechanisms] and steps 2 and 3 [clinical knowledge] questions) [<xref ref-type="bibr" rid="ref27">27</xref>], China (Mainland MCMLE MedQA) [<xref ref-type="bibr" rid="ref27">27</xref>], Taiwan (TWMLE MedQA) [<xref ref-type="bibr" rid="ref27">27</xref>], France (FrMedMCQA) [<xref ref-type="bibr" rid="ref28">28</xref>], and India (MedMCQA; Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) [<xref ref-type="bibr" rid="ref29">29</xref>]. Responses were parsed using a regular expression. 
Answers that did not follow the template were parsed using GPT-4o.</p></sec><sec id="s2-2"><title>Confidence Measure</title><p>Two metrics were used to gauge the model&#x2019;s confidence in the primary analysis: (1) the confidence expressed directly in the model&#x2019;s output, as previously reported [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref26">26</xref>], called expressed confidence; and (2) the probability of the token corresponding to the model&#x2019;s response (&#x201C;A,&#x201D; &#x201C;B,&#x201D; &#x201C;C,&#x201D; &#x201C;D,&#x201D; or &#x201C;E&#x201D;), retrieved directly from the API, called response token probability. Token probabilities (Figure S1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>) are the likelihood of each token to appear at a given point during the generation process, contingent on the preceding context and model parameters [<xref ref-type="bibr" rid="ref30">30</xref>]. In a Q-A context reliant on token values, token probabilities have been leveraged to implicitly evaluate labeling certainty [<xref ref-type="bibr" rid="ref20">20</xref>]. Response token probabilities were not normalized to assess the simplest measure available for end users.</p><p>In secondary analyses, two additional metrics were evaluated: (1) the Shannon entropy of the response token distribution, a measure of the dispersion of the probabilities of possible response tokens [<xref ref-type="bibr" rid="ref31">31</xref>]; and (2) perplexity, a measure of the overall probability of the answer generated by the model, defined as the product of one over the probability of each token that appeared in the response [<xref ref-type="bibr" rid="ref31">31</xref>].</p></sec><sec id="s2-3"><title>Statistical Analysis</title><p>Accuracy estimates, along with their 95% CIs, were compared across the 9 LLMs within each Q-A dataset using the Fisher exact test. 
The means and distribution of expressed confidence, token probability, Shannon entropy, and answer perplexity were visually represented using violin plots and compared based on response correctness. Predictive performance was assessed through receiver operating characteristic curves, presenting area under the receiver operating characteristic curve (AUROC) with 95% CIs, and compared between expressed confidence and response token probabilities using the DeLong test. Optimal discrimination thresholds were determined using the Youden <italic>J</italic> statistic. True positive rate, true negative rate, as well as accuracy rates above and below optimal discrimination threshold were estimated with 95% CIs and compared using McNemar tests. Model confidence calibration was evaluated using calibration curves and quantified based on both the adaptive calibration error (ACE), a measure of the average deviation across confidence bins, using adaptive binning to mitigate dataset imbalances, and Brier score, a measure of both accuracy and calibration by computing the mean squared difference between predicted probabilities and actual outcomes, with their respective 95% CI (bootstrap) [<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]; both metrics were compared using bootstrapped <italic>P</italic> values. ACE greater than 0.25 indicates poor calibration. All statistical analyses were performed using R (version 4.4; R Foundation for Statistical Computing) and Python (version 3.12; Python Software Foundation), with a significance threshold set at 5%.</p></sec><sec id="s2-4"><title>Ethical Considerations</title><p>All data used for this study are publicly available and do not involve patient data. 
As such, no ethics review board assessment was necessary.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overall Performance</title><p>GPT-4o significantly outperforms all other tested models with an accuracy of 89% (95% CI 87.7&#x2010;90.2; <italic>P</italic>&#x003C;.001; Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and Figure S2 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). Phi-3-Mini, the smallest model tested, records the lowest score of 56.5% (95% CI 54.6&#x2010;58.5). Considering a passing grade of 60%, GPT-3.5, GPT-4, GPT-4o, Llama 3.1-70b, Phi-3-Medium, and Gemma 2-27b pass the USMLE board with vanilla prompting. No model scores an &#x201C;expert grade&#x201D; of more than 90%. Similar performance patterns are visible with the other datasets, with larger models scoring higher than their smaller counterparts (Figure S2 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p><p>Different prompting strategies and temperatures had limited effect on the performance of tested models, except for GPT-3.5 Turbo, which performed significantly better on MedQA with few-shot prompting (accuracy of 63.5, 95% CI 61.6&#x2010;65.4 vs 60.1, 95% CI 58.2&#x2010;62.0; <italic>P</italic>=.01; Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and Figure S3 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p></sec><sec id="s3-2"><title>Uncertainty Quantification</title><p>When elicited to assess their confidence in their answer, all models verbalized high scores, notably higher than 80%, in multiples of 5, and with limited dispersion around the median value. Expressed confidence ranged from 90 (95% CI 90-90) for Llama 3.1-70b to 100 (95% CI 100-100) for GPT-3.5. 
All models consistently expressed a confidence level exceeding 80/100 in their responses, regardless of the accuracy of the answer, particularly for GPT-3.5, GPT-4, Phi-3-Mini, and Phi-3-Medium (Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>Response token probabilities were also skewed toward high values for all models but displayed more dispersed values, ranging from 93 (95% CI 71-99) for Phi-3-Medium to 100 (95% CI 100-100) for GPT-4 (<xref ref-type="fig" rid="figure1">Figure 1</xref>). Response token probabilities were significantly more likely to be above 80/100 for correct answers than for incorrect ones, across all models (all <italic>P</italic>&#x003C;.001). As illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>, the qualitative analysis of response token probabilities revealed a frequent dispersion of probabilities among the options when the chatbot&#x2019;s response is incorrect. Contrarily, when the model&#x2019;s chosen response was correct, the token probability distribution is often almost entirely directed toward a single option (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Distributions of expressed confidence versus token probability of large language models on the US MedQA dataset (n=2487), according to the accuracy of model answers.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e64348_fig01.png"/></fig><p>The analyses replicated on other Q-A datasets confirmed the models&#x2019; tendency to express high levels of confidence (Tables S5-S8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and Figures S2-S5 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p></sec><sec id="s3-3"><title>Discriminative Power of Expressed Confidence Versus Response Token 
Probability</title><p>Overall, expressed confidence failed to predict the accuracy of responses, whereas response token probabilities demonstrated satisfactory to very good discriminatory performance. The AUROCs of expressed confidence ranged from 0.51 (95% CI 0.49&#x2010;0.53) for Phi-3-Mini to 0.70 (95% CI 0.67&#x2010;0.73) for GPT-4o (<xref ref-type="fig" rid="figure2">Figure 2A</xref> and <xref ref-type="table" rid="table1">Table 1</xref>). In all cases, response token probability markedly outperformed expressed confidence (all <italic>P</italic>&#x003C;.001), with AUROCs ranging from 0.71 (95% CI 0.69&#x2010;0.73) for Phi-3-Mini to 0.87 (95% CI 0.85&#x2010;0.89) for GPT-4o (<xref ref-type="fig" rid="figure2">Figure 2A</xref> and <xref ref-type="table" rid="table1">Table 1</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>(A) Receiver operating characteristic curve and discriminative power of expressed confidence versus perplexity, entropy, and token response probability in predicting answer accuracy of large language models. (B) Plot of the AUROC of expressed confidence and response token probability as a function of models&#x2019; accuracy on the US MedQA dataset (n=2487 questions). 
AUROC: area under the receiver operating characteristic curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e64348_fig02.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Diagnostic performance accuracy of expressed confidence versus token response probability, according to the different models (US MedQA; n=2522 questions).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" colspan="2">Model and metric</td><td align="left" valign="bottom">Expressed confidence</td><td align="left" valign="bottom">Token probability</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">GPT-3.5 Turbo</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">AUROC<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> (95% CI)</td><td align="left" valign="top">0.52 (0.51&#x2010;0.53)</td><td align="left" valign="top">0.7 (0.68&#x2010;0.72)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Optimal threshold (%)</td><td align="left" valign="top">98</td><td align="left" valign="top">98</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">True positive rate (%) (95% CI)</td><td align="left" valign="top">95 (94&#x2010;96)</td><td align="left" valign="top">66 (64&#x2010;68)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">False positive rate (%) (95% CI)</td><td align="left" valign="top">91 (89&#x2010;93)</td><td align="left" valign="top">37 (34&#x2010;40)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" 
valign="top">Correct above threshold (%) (95% CI)</td><td align="left" valign="top">61 (59&#x2010;63)</td><td align="left" valign="top">73 (71&#x2010;75)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct below threshold (%) (95% CI)</td><td align="left" valign="top">44 (36&#x2010;52)</td><td align="left" valign="top">45 (42&#x2010;48)</td><td align="left" valign="top">.84</td></tr><tr><td align="left" valign="top" colspan="5">GPT-4</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">AUROC (95% CI)</td><td align="left" valign="top">0.61 (0.58&#x2010;0.63)</td><td align="left" valign="top">0.82 (0.81&#x2010;0.84)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Optimal threshold (%)</td><td align="left" valign="top">98</td><td align="left" valign="top">100</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">True positive rate (%) (95% CI)</td><td align="left" valign="top">70 (68&#x2010;72)</td><td align="left" valign="top">71 (69&#x2010;73)</td><td align="left" valign="top">.44</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">False positive rate (%) (95% CI)</td><td align="left" valign="top">51 (46&#x2010;55)</td><td align="left" valign="top">17 (14&#x2010;21)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct above threshold (%) (95% CI)</td><td align="left" valign="top">84 (82&#x2010;86)</td><td align="left" valign="top">94 (93&#x2010;95)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct below threshold (%) (95% CI)</td><td align="left" valign="top">70 (67&#x2010;73)</td><td align="left" valign="top">57 (54&#x2010;60)</td><td align="left" 
valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="5">GPT-4o</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">AUROC (95% CI)</td><td align="left" valign="top">0.7 (0.67&#x2010;0.73)</td><td align="left" valign="top">0.87 (0.85&#x2010;0.89)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Optimal threshold (%)</td><td align="left" valign="top">92</td><td align="left" valign="top">100</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">True positive rate (%) (95% CI)</td><td align="left" valign="top">66 (64&#x2010;68)</td><td align="left" valign="top">79 (77&#x2010;81)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">False positive rate (%) (95% CI)</td><td align="left" valign="top">26 (20&#x2010;31)</td><td align="left" valign="top">13 (8.8&#x2010;17)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct above threshold (%) (95% CI)</td><td align="left" valign="top">95 (94&#x2010;96)</td><td align="left" valign="top">98 (97&#x2010;99)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct below threshold (%) (95% CI)</td><td align="left" valign="top">79 (76&#x2010;81)</td><td align="left" valign="top">66 (63&#x2010;70)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="5">Llama 3.1-8b</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">AUROC (95% CI)</td><td align="left" valign="top">0.56 (0.54&#x2010;0.58)</td><td align="left" valign="top">0.73 (0.71&#x2010;0.75)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Optimal 
threshold (%)</td><td align="left" valign="top">85</td><td align="left" valign="top">91</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">True positive rate (%) (95% CI)</td><td align="left" valign="top">76 (74&#x2010;78)</td><td align="left" valign="top">64 (61&#x2010;66)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">False positive rate (%) (95% CI)</td><td align="left" valign="top">63 (60&#x2010;66)</td><td align="left" valign="top">29 (26&#x2010;31)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct above threshold (%) (95% CI)</td><td align="left" valign="top">66 (64&#x2010;68)</td><td align="left" valign="top">78 (76&#x2010;81)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct below threshold (%) (95% CI)</td><td align="left" valign="top">52 (48&#x2010;55)</td><td align="left" valign="top">45 (43&#x2010;48)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="5">Llama 3.1-70b</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">AUROC (95% CI)</td><td align="left" valign="top">0.58 (0.55&#x2010;0.6)</td><td align="left" valign="top">0.84 (0.82&#x2010;0.85)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Optimal threshold (%)</td><td align="left" valign="top">85</td><td align="left" valign="top">99</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">True positive rate (%) (95% CI)</td><td align="left" valign="top">90 (89&#x2010;91)</td><td align="left" valign="top">66 (63&#x2010;68)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" 
valign="top"/><td align="left" valign="top">False positive rate (%) (95% CI)</td><td align="left" valign="top">72 (68&#x2010;76)</td><td align="left" valign="top">9.6 (7.1&#x2010;12)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct above threshold (%) (95% CI)</td><td align="left" valign="top">82 (81&#x2010;84)</td><td align="left" valign="top">96 (95&#x2010;97)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct below threshold (%) (95% CI)</td><td align="left" valign="top">58 (52&#x2010;63)</td><td align="left" valign="top">59 (56&#x2010;62)</td><td align="left" valign="top">.57</td></tr><tr><td align="left" valign="top" colspan="5">Phi-3-Mini</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">AUROC (95% CI)</td><td align="left" valign="top">0.51 (0.49&#x2010;0.53)</td><td align="left" valign="top">0.71 (0.69&#x2010;0.73)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Optimal threshold (%)</td><td align="left" valign="top">98</td><td align="left" valign="top">92</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">True positive rate (%) (95% CI)</td><td align="left" valign="top">55 (52&#x2010;57)</td><td align="left" valign="top">44 (42&#x2010;47)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">False positive rate (%) (95% CI)</td><td align="left" valign="top">52 (49&#x2010;55)</td><td align="left" valign="top">12 (10&#x2010;14)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct above threshold (%) (95% CI)</td><td align="left" valign="top">58 (55&#x2010;60)</td><td align="left" valign="top">82 
(80&#x2010;85)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct below threshold (%) (95% CI)</td><td align="left" valign="top">55 (52&#x2010;58)</td><td align="left" valign="top">45 (43&#x2010;48)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="5">Phi-3-Medium</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">AUROC (95% CI)</td><td align="left" valign="top">0.57 (0.55&#x2010;0.59)</td><td align="left" valign="top">0.76 (0.74&#x2010;0.78)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Optimal threshold (%)</td><td align="left" valign="top">92</td><td align="left" valign="top">91</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">True positive rate (%) (95% CI)</td><td align="left" valign="top">80 (78&#x2010;82)</td><td align="left" valign="top">67 (65&#x2010;70)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">False positive rate (%) (95% CI)</td><td align="left" valign="top">68 (65&#x2010;71)</td><td align="left" valign="top">27 (24&#x2010;30)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct above threshold (%) (95% CI)</td><td align="left" valign="top">71 (69&#x2010;73)</td><td align="left" valign="top">84 (82&#x2010;86)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct below threshold (%) (95% CI)</td><td align="left" valign="top">56 (52&#x2010;60)</td><td align="left" valign="top">48 (45&#x2010;51)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="5">Gemma 2-9b</td></tr><tr><td align="left" 
valign="top"/><td align="left" valign="top">AUROC (95% CI)</td><td align="left" valign="top">0.6 (0.57&#x2010;0.62)</td><td align="left" valign="top">0.74 (0.72&#x2010;0.75)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Optimal threshold (%)</td><td align="left" valign="top">92</td><td align="left" valign="top">100</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">True positive rate (%) (95% CI)</td><td align="left" valign="top">40 (37&#x2010;42)</td><td align="left" valign="top">62 (59&#x2010;64)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">False positive rate (%) (95% CI)</td><td align="left" valign="top">24 (22&#x2010;27)</td><td align="left" valign="top">26 (23&#x2010;28)</td><td align="left" valign="top">.40</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct above threshold (%) (95% CI)</td><td align="left" valign="top">71 (67&#x2010;74)</td><td align="left" valign="top">78 (75&#x2010;80)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct below threshold (%) (95% CI)</td><td align="left" valign="top">54 (51&#x2010;56)</td><td align="left" valign="top">43 (40&#x2010;46)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="5">Gemma 2-27b</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">AUROC (95% CI)</td><td align="left" valign="top">0.56 (0.54&#x2010;0.58)</td><td align="left" valign="top">0.76 (0.74&#x2010;0.78)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Optimal threshold (%)</td><td align="left" valign="top">98</td><td align="left" valign="top">100</td><td align="left" 
valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">True positive rate (%) (95% CI)</td><td align="left" valign="top">17 (15&#x2010;19)</td><td align="left" valign="top">60 (57&#x2010;62)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">False positive rate (%) (95% CI)</td><td align="left" valign="top">9.5 (7.5&#x2010;11)</td><td align="left" valign="top">19 (16&#x2010;21)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct above threshold (%) (95% CI)</td><td align="left" valign="top">77 (72&#x2010;81)</td><td align="left" valign="top">85 (83&#x2010;87)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Correct below threshold (<italic>%)</italic> (95% CI)</td><td align="left" valign="top">62 (60&#x2010;64)</td><td align="left" valign="top">47 (45&#x2010;50)</td><td align="left" valign="top">&#x003C;.001</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>AUROC: area under the receiver operating characteristic curve.</p></fn><fn id="table1fn2"><p><sup>b</sup>Not available.</p></fn></table-wrap-foot></table-wrap><p>There was a positive linear correlation between both expressed confidence and response token probability and the models&#x2019; accuracy, with the discriminative power of both metrics increasing with the accuracy of the model (<xref ref-type="fig" rid="figure2">Figure 2B</xref>).</p><p>The sensitivity analyses conducted on the other multilingual Q-A datasets confirmed the consistent superior discriminative capabilities of response token probability in predicting response accuracy compared to expressed confidence (Tables S5-S8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and Figures S4-S11 in <xref ref-type="supplementary-material" 
rid="app2">Multimedia Appendix 2</xref>). Slightly lower performance of the response token probability was observed on the MedQA Mainland China dataset for smaller models, with AUROCs for Phi-3-Mini at 0.56 (95% CI 0.54&#x2010;0.59) and Llama 3.1-8b at 0.59 (95% CI 0.57&#x2010;0.62).</p></sec><sec id="s3-4"><title>Predictive Performance of Expressed Confidence Versus Response Token Probability</title><p>Threshold values to optimize the models&#x2019; discriminative abilities to predict accuracy were high for both expressed confidence and response token probability. These values ranged from 85% (Llama 3.1-8b and Llama 3.1-70b) to 98% (GPT-3.5 Turbo, GPT-4, Phi-3-Mini, and Gemma 2-27b) for expressed confidence and from 91% (Llama 3.1-8b and Phi-3-Medium) to 100% (GPT-4, GPT-4o, Gemma 2-9b, and Gemma 2-27b) for token probability (<xref ref-type="table" rid="table1">Table 1</xref>).</p><p>All models exhibited significantly higher accuracy when the response token probability exceeded its optimal threshold, ranging from 73% (95% CI 71-75) for GPT-3.5 Turbo to 98% (95% CI 97-99) for GPT-4o (all <italic>P</italic>&#x003C;.001). In comparison, when expressed confidence exceeded its respective optimal threshold, accuracy ranged from 58% (95% CI 55-60) for Phi-3-Mini to 95% (95% CI 94-96) for GPT-4o (all <italic>P</italic>&#x003C;.001; <xref ref-type="table" rid="table1">Table 1</xref>).</p><p>Moreover, the false positive rate (incorrect responses above threshold) was significantly lower for answers with high response token probability, ranging from 9.6% (95% CI 7.1&#x2010;12) for Llama 3.1-70b to 37% (95% CI 34-40) for GPT-3.5 Turbo (all <italic>P</italic>&#x003C;.001). In contrast, for answers with high expressed confidence, the false positive rates ranged from 9.5% (95% CI 7.5&#x2010;11) for Gemma 2-27b to 91% (95% CI 89-93) for GPT-3.5 Turbo (all <italic>P</italic>&#x003C;.001 except Gemma 2-9b, <italic>P</italic>=.40; <xref ref-type="table" rid="table1">Table 1</xref>). 
Similar results were observed across all other Q-A datasets (Tables S5-S8 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s3-5"><title>Calibration Error</title><p>All models exhibited a tendency toward overconfidence, with smaller models being the most poorly calibrated (<xref ref-type="fig" rid="figure3">Figure 3</xref>). ACE ranged from over 20%&#x2010;25% for Phi-3-Medium and Llama 3.1-8b to over 30% for GPT-3.5, and even exceeded 40% for Phi-3-Mini (<xref ref-type="fig" rid="figure3">Figure 3</xref>). In contrast, the larger models demonstrated satisfactory calibration for both expressed confidence and response token probability, notably less than 10% with GPT-4o. Overall, all models exhibited significantly lower Brier scores, meaning better calibration with response token probability, ranging from 0.09 (95% CI 0.08&#x2010;0.11) for GPT-4o to 0.35 (95% CI 0.33&#x2010;0.36) for Gemma 2-9b, than with expressed confidence, ranging from 0.10 (95% CI 0.09&#x2010;0.11) for GPT-4o to 0.42 (95% CI 0.40&#x2010;0.43) for Phi-3-Mini (all <italic>P</italic>&#x003C;.05; <xref ref-type="fig" rid="figure3">Figure 3</xref>). Similarly, ACEs were almost always lower with response token probability than with expressed confidence, although the larger models sometimes showed lower ACEs on expressed confidence. 
The same tendency toward overconfidence is highlighted across other Q-A datasets, with poor calibration observed for smaller models and a significantly lower Brier score with response token probability (Tables S9-S13 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and Figures S15-S20 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Calibration plots comparing expressed confidence versus token response probability in predicting answer accuracy of large language models (US MedQA dataset: n=2487 questions). ACE: adapted calibration error.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e64348_fig03.png"/></fig></sec><sec id="s3-6"><title>Effect of Knowledge Type</title><p>Response token probability generally performed similarly across step 1 and steps 2 and 3 items from MedQA for most models, with no significant differences observed between these knowledge categories. In particular, GPT-4 showed area under the curves (AUCs) of 0.80 (95% CI 0.82&#x2010;0.85) versus 0.79 (95% CI 0.82&#x2010;0.85; <italic>P</italic>=.92), GPT-4o had 0.84 (95% CI 0.87&#x2010;0.90) versus 0.85 (95% CI 0.88&#x2010;0.91; <italic>P</italic>=.68), and Llama 3.1-70b demonstrated 0.81 (95% CI 0.84&#x2010;0.86) versus 0.81 (95% CI 0.84&#x2010;0.86; <italic>P</italic>=.99). Similarly, GPT-3.5 Turbo, Llama 3.1-8b, Phi 3, and Gemma 2 models all displayed minimal AUC differences between basic sciences and clinical knowledge (all <italic>P</italic>&#x003E;.05; Tables S14-S16 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s3-7"><title>Effect of Prompting Strategies</title><p>Vanilla prompts generally performed on par with other prompting strategies for most models, with only small differences with the few-shot prompt. 
In particular, GPT-3.5 Turbo and Llama 3.1-8b showed significant advantages for few shot over vanilla (AUC increases from 0.70 to 0.74; <italic>P</italic>=.002 and 0.73 to 0.76; <italic>P</italic>=.01, respectively), while the remaining comparisons in GPT-4, GPT-4o, Llama 3.1-70b, Phi 3, and Gemma 2 models did not detect statistically meaningful deviations from vanilla (<xref ref-type="fig" rid="figure4">Figure 4</xref> and Table S17 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Effect of prompting techniques on receiver operating characteristic curve and discriminative power of the response tokens&#x2019; probability in predicting answer accuracy of large language models (US MedQA dataset: n=2487 questions). AUROC: area under the receiver operating characteristic curve; EC: expressed confidence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e64348_fig04.png"/></fig></sec><sec id="s3-8"><title>Shannon Entropy and Answer Perplexity</title><p>Shannon entropy achieved performance comparable to token&#x2010;level probabilities across nearly all models, typically showing overlapping AUC estimates and minimal statistical deviations. For example, with GPT-4, the entropy AUC was 0.82 (95% CI 0.80&#x2010;0.84) versus 0.82 (95% CI 0.81&#x2010;0.84) for token probability (<italic>P</italic>=.41), and GPT-4o yielded 0.87 (95% CI 0.85&#x2010;0.89) for both metrics (<italic>P</italic>=.16). 
By contrast, perplexity consistently underperformed relative to token probability in all models, as exemplified by Llama 3.1-70b, which had an AUC of 0.77 (95% CI 0.75&#x2010;0.79) for perplexity and 0.84 (95% CI 0.83&#x2010;0.86) for token probability (<italic>P</italic>&#x003C;.001; Table S18 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and <xref ref-type="fig" rid="figure2">Figure 2A</xref>).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study is the first attempt to control the risk of misinformation in the use of medical chatbots by comparing the predictive value of expressed confidence and response token probabilities for various LLMs when addressing medical questions from diverse multilingual medical licensing examination datasets. The findings demonstrate the robust performance of LLMs in this domain, displaying an accuracy rate for USMLE ranging from approximately 60% for smaller models (GPT-3.5, Llama 3.1-8b, and Phi-3-Mini) to nearly 90% for GPT-4o, and mostly succeeding in medical board examinations across different languages. A slight drop in performance was observed for the smallest models on the Mainland China dataset, probably because of the underrepresentation of Chinese texts in the training data of these models. Notably, different prompting strategies had little to no impact on the performance of response token probabilities across all datasets and models tested. Furthermore, the results underscore a consistent tendency among LLMs to exhibit high expressed confidence in their responses, with scores typically ranging between 80% and 100%, regardless of their actual accuracy in most cases. 
While response token probabilities display a similar inclination due to their imperfect calibration, they markedly outperform expressed confidence in predicting response accuracy, with very good predictive abilities among the larger models.</p></sec><sec id="s4-2"><title>Limitations</title><p>This study, however, has several limitations. First, we restricted our analysis to single-choice questions. Consequently, these findings may not be directly applicable to textual responses, for which the model&#x2019;s decision is not confined to a single token. While there are proposed methods for aggregating token probabilities, they are still under development [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. Second, we could only assess the effect of some variations of prompt-engineering techniques as well as 2 temperature settings, while other techniques and parameters could have potentially impacted the outcomes. However, we deliberately opted for using LLMs in their default configuration to mimic real-world use by standard users, as in previous reports [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. Finally, our study did not exhaustively cover all available LLMs and languages. However, by integrating various chatbots of different sizes and architectures, both commercial and open source, we aimed to encompass a broad spectrum of models currently in use.</p></sec><sec id="s4-3"><title>Comparison With Prior Work</title><p>Our findings on the performance of LLMs in achieving passing grades on board-style examinations align with existing literature. Initial studies demonstrated GPT-3.5&#x2019;s ability to score over 60% on the USMLE less than 2 years ago [<xref ref-type="bibr" rid="ref6">6</xref>]. Since then, both commercial and open-source models have improved their scores through vanilla prompting and prompt engineering. 
During this period, models have become larger and more capable, with GPT-4o achieving nearly expert-level grades. Simultaneously, lighter models, such as Llama 3.1-70b or Phi 3, which are at least 10 times smaller than GPT-3.5, are now also capable of scoring passing grades on the USMLE.</p><p>Despite the impressive performance of LLMs on medical benchmarks, our research supports previous findings regarding their tendency to consistently overestimate their confidence levels (see literature review in Table S19 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref34">34</xref>-<xref ref-type="bibr" rid="ref36">36</xref>]. Language models typically assess their confidence between 80 and 100, often in increments of 5, potentially imitating human patterns of expressing confidence [<xref ref-type="bibr" rid="ref26">26</xref>]. This tendency is even more obvious in smaller models, where confidence levels remain high regardless of response accuracy. In a recent study by Krishna et al [<xref ref-type="bibr" rid="ref12">12</xref>], ChatGPT-3.5 and 4 were asked to answer 150 radiology board-style multiple-choice text-based questions and rate their confidence level from 1 to 10. Both models consistently estimated their confidence level at 8/10 or higher, even when the response was incorrect (in 100% of cases for ChatGPT-3.5 and 77% for ChatGPT-4) [<xref ref-type="bibr" rid="ref12">12</xref>]. One underlying explanation for this overconfidence is the exposure of LLMs to exaggerated expressions in training data, where numbers are commonly used in a figurative manner [<xref ref-type="bibr" rid="ref13">13</xref>]. Recently, Farquhar et al [<xref ref-type="bibr" rid="ref23">23</xref>] have also explored probabilistic approaches to detect hallucinations in LLMs&#x2019; responses. 
However, their method relies on a comprehensive analysis of all answering possibilities for every query, which implies significant computational complexity and costs. Such a method is not compatible with commercial models run on the cloud such as ChatGPT, and as such, seems currently impractical in a clinical setting. Additionally, their study did not include medical questions and languages other than English, leaving an easily implementable method for patients and physicians still to be explored [<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>Distinguishing when language models know and when they hesitate is crucial in high-stakes settings such as decision-making in medicine. Kung et al [<xref ref-type="bibr" rid="ref37">37</xref>] demonstrated that the density of insight in GPT-3.5&#x2019;s responses to the USMLE was lower when the model provided incorrect answers. However, only an expert can accurately assess the actual amount of knowledge contained in a language model&#x2019;s output. We hypothesize that this lack of insight could manifest as a lower probability in the answer token. When the model knows the answer, it outputs a token with near-certain probability. Conversely, when it doubts because of a lack of insight, the probabilities are distributed across multiple possible answers, lowering the response token probability. Our results showing a correlation between models&#x2019; accuracy and predictive value of both expressed confidence and response token probability further strengthen this hypothesis. Higher-performing models have learned more robust internal representations, thus allowing them to generate more meaningful token probabilities as well as identify what they do not know.</p><p>LLMs demonstrate high performance in medical tasks [<xref ref-type="bibr" rid="ref37">37</xref>], suggesting their imminent integration into patient daily lives and physician workflows. 
Assessing the risk of incorrect answers would further increase the trustworthiness of those emerging assistants by identifying potentially doubtful answers that could need human review. Because response token probabilities do not need advanced prompting techniques, they could be easily adopted by the medical community as a way to implicitly measure the ability of models to express their doubts. When LLMs are deployed for labeling large amounts of data, be it for extracting information from free text or correcting errors in medical reports, response token probabilities could pinpoint cases necessitating expert review, thereby reducing error risk, human effort, and financial costs [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref38">38</xref>].</p></sec><sec id="s4-4"><title>Future Directions</title><p>Future work should focus on investigating uncertainty estimation methods in contexts beyond closed-form, single-token responses, such as open-ended reasoning tasks or multistep problem-solving. With the release of reasoning models, integrating Chain-of-Thought architectures will also pose greater challenges for uncertainty estimation, as their decision process involves multiple reasoning steps rather than a single-token prediction. Additionally, future studies should assess the impact of fine-tuning models on medical-specific corpora to determine whether expressed confidence calibration can be improved. Finally, future work will need to explore the practical implementation of uncertainty estimation methods in real-world settings, particularly in clinical applications. 
This includes evaluating how these measures can be effectively integrated into medical decision-making workflows, ensuring that they are interpretable and actionable for health care professionals, and designing interfaces that facilitate their adoption in high-stakes environments.</p></sec><sec id="s4-5"><title>Conclusions</title><p>Our study underscores the robust performance of language models in addressing medical questions across diverse multilingual databases. Token probabilities emerge as a promising alternative for predicting response accuracy, counterbalancing a consistent tendency toward overconfidence, warranting further investigation for enhanced model confidence estimation.</p></sec></sec></body><back><fn-group><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">ACE</term><def><p>adaptive calibration error</p></def></def-item><def-item><term id="abb2">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb3">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb4">AUROC</term><def><p>area under the receiver operating characteristic curve</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">Q-A</term><def><p>question-answer</p></def></def-item><def-item><term id="abb7">STROBE</term><def><p>Strengthening the Reporting of Observational Studies in Epidemiology</p></def></def-item><def-item><term id="abb8">USMLE</term><def><p>United States Medical Licensing Examination</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayers</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Poliak</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Dredze</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Comparing physician and artificial intelligence chatbot responses to patient questions posted to a public social media forum</article-title><source>JAMA Intern Med</source><year>2023</year><month>06</month><day>1</day><volume>183</volume><issue>6</issue><fpage>589</fpage><lpage>596</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id><pub-id pub-id-type="medline">37115527</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Assessing the utility of ChatGPT throughout the entire clinical workflow: development and usability study</article-title><source>J Med Internet Res</source><year>2023</year><month>08</month><day>22</day><volume>25</volume><fpage>e48659</fpage><pub-id pub-id-type="doi">10.2196/48659</pub-id><pub-id pub-id-type="medline">37606976</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sandmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Riepenhausen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Plagwitz</surname><given-names>L</given-names> </name><name name-style="western"><surname>Varghese</surname><given-names>J</given-names> </name></person-group><article-title>Systematic analysis of ChatGPT, Google search and Llama 2 for clinical decision support tasks</article-title><source>Nat 
Commun</source><year>2024</year><month>03</month><day>6</day><volume>15</volume><issue>1</issue><fpage>2050</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-46411-8</pub-id><pub-id pub-id-type="medline">38448475</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Le Guellec</surname><given-names>B</given-names> </name><name name-style="western"><surname>Lef&#x00E8;vre</surname><given-names>A</given-names> </name><name name-style="western"><surname>Geay</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Performance of an open-source large language model in extracting information from free-text radiology reports</article-title><source>Radiol Artif Intell</source><year>2024</year><month>07</month><volume>6</volume><issue>4</issue><fpage>e230364</fpage><pub-id pub-id-type="doi">10.1148/ryai.230364</pub-id><pub-id pub-id-type="medline">38717292</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reynolds</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tejasvi</surname><given-names>T</given-names> </name></person-group><article-title>Potential use of ChatGPT in responding to patient questions and creating patient resources</article-title><source>JMIR Dermatol</source><year>2024</year><month>03</month><day>6</day><volume>7</volume><issue>1</issue><fpage>e48451</fpage><pub-id pub-id-type="doi">10.2196/48451</pub-id><pub-id pub-id-type="medline">38446541</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> 
</name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kanjee</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Crowe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Rodman</surname><given-names>A</given-names> </name></person-group><article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title><source>JAMA</source><year>2023</year><month>07</month><day>3</day><volume>330</volume><issue>1</issue><fpage>78</fpage><lpage>80</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id><pub-id pub-id-type="medline">37318797</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature New Biol</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id 
pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Hassan</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mahmood</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Trialling a large language model (ChatGPT) in general practice with the applied knowledge test: observational study demonstrating opportunities and limitations in primary care</article-title><source>JMIR Med Educ</source><year>2023</year><month>04</month><day>21</day><volume>9</volume><fpage>e46599</fpage><pub-id pub-id-type="doi">10.2196/46599</pub-id><pub-id pub-id-type="medline">37083633</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Savage</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gallo</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Large language model uncertainty proxies: discrimination and calibration for medical diagnosis and treatment</article-title><source>J Am Med Inform Assoc</source><year>2025</year><month>01</month><day>1</day><volume>32</volume><issue>1</issue><fpage>139</fpage><lpage>149</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae254</pub-id><pub-id pub-id-type="medline">39396184</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Choudhury</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Shamszare</surname><given-names>H</given-names> </name></person-group><article-title>Investigating the impact of user trust on the adoption and use of ChatGPT: survey analysis</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>14</day><volume>25</volume><fpage>e47184</fpage><pub-id pub-id-type="doi">10.2196/47184</pub-id><pub-id pub-id-type="medline">37314848</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Krishna</surname><given-names>S</given-names> </name><name name-style="western"><surname>Bhambra</surname><given-names>N</given-names> </name><name name-style="western"><surname>Bleakney</surname><given-names>R</given-names> </name><name name-style="western"><surname>Bhayana</surname><given-names>R</given-names> </name></person-group><article-title>Evaluation of reliability, repeatability, robustness, and confidence of GPT-3.5 and GPT-4 on a radiology board-style examination</article-title><source>Radiology</source><year>2024</year><month>05</month><volume>311</volume><issue>2</issue><fpage>e232715</fpage><pub-id pub-id-type="doi">10.1148/radiol.232715</pub-id><pub-id pub-id-type="medline">38771184</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>K</given-names> </name><name name-style="western"><surname>Hwang</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Ren</surname><given-names>X</given-names> </name><name name-style="western"><surname>Sap</surname><given-names>M</given-names> </name></person-group><article-title>Relying on the unreliable: the impact of language models&#x2019; reluctance to express uncertainty</article-title><source>arXiv</source><comment>Preprint posted online on  
Jul 9, 2024</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2401.06730</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nov</surname><given-names>O</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>N</given-names> </name><name name-style="western"><surname>Mann</surname><given-names>D</given-names> </name></person-group><article-title>Putting ChatGPT&#x2019;s medical advice to the (Turing) test: survey study</article-title><source>JMIR Med Educ</source><year>2023</year><month>07</month><day>10</day><volume>9</volume><fpage>e46939</fpage><pub-id pub-id-type="doi">10.2196/46939</pub-id><pub-id pub-id-type="medline">37428540</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Begoli</surname><given-names>E</given-names> </name><name name-style="western"><surname>Bhattacharya</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kusnezov</surname><given-names>D</given-names> </name></person-group><article-title>The need for uncertainty quantification in machine-assisted medical decision making</article-title><source>Nat Mach Intell</source><year>2019</year><month>01</month><day>7</day><volume>1</volume><issue>1</issue><fpage>20</fpage><lpage>23</lpage><pub-id pub-id-type="doi">10.1038/s42256-018-0004-1</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Groot</surname><given-names>T</given-names> </name><name name-style="western"><surname>Valdenegro-Toro</surname><given-names>M</given-names> </name></person-group><article-title>Overconfidence is key: verbalized uncertainty evaluation in large language and vision-language 
models</article-title><source>arXiv</source><comment>Preprint posted online on  May 5, 2024</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2405.02917</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Berner</surname><given-names>ES</given-names> </name><name name-style="western"><surname>Graber</surname><given-names>ML</given-names> </name></person-group><article-title>Overconfidence as a cause of diagnostic error in medicine</article-title><source>Am J Med</source><year>2008</year><month>05</month><volume>121</volume><issue>5 Suppl</issue><fpage>S2</fpage><lpage>23</lpage><pub-id pub-id-type="doi">10.1016/j.amjmed.2008.01.001</pub-id><pub-id pub-id-type="medline">18440350</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>SSY</given-names> </name><name name-style="western"><surname>Liao</surname><given-names>QV</given-names> </name><name name-style="western"><surname>Vorvoreanu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ballard</surname><given-names>S</given-names> </name><name name-style="western"><surname>Vaughan</surname><given-names>JW</given-names> </name></person-group><article-title>&#x201C;I&#x2019;m not sure, but...&#x201D;: examining the impact of large language models&#x2019; uncertainty expression on user reliance and trust</article-title><conf-name>The 2024 ACM Conference on Fairness, Accountability, and Transparency</conf-name><conf-date>2024</conf-date><conf-loc>Rio de Janeiro, Brazil</conf-loc><fpage>822</fpage><lpage>835</lpage><pub-id pub-id-type="doi">10.1145/3630106.3658941</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Goodwin</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Hannah</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Nicholl</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Ferri</surname><given-names>JM</given-names> </name></person-group><article-title>The confident co&#x2010;witness: the effects of misinformation on memory after collaborative discussion</article-title><source>Appl Cogn Psychol</source><year>2017</year><month>03</month><volume>31</volume><issue>2</issue><fpage>225</fpage><lpage>235</lpage><pub-id pub-id-type="doi">10.1002/acp.3320</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zeng</surname><given-names>M</given-names> </name></person-group><article-title>Want to reduce labeling cost? 
GPT-3 can help</article-title><conf-name>Findings of the Association for Computational Linguistics: EMNLP 2021</conf-name><conf-date>Nov 7-11, 2021</conf-date><conf-loc>Punta Cana, Dominican Republic</conf-loc><fpage>4195</fpage><lpage>4205</lpage><pub-id pub-id-type="doi">10.18653/v1/2021.findings-emnlp.354</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Quevedo</surname><given-names>E</given-names> </name><name name-style="western"><surname>Yero</surname><given-names>J</given-names> </name><name name-style="western"><surname>Koerner</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rivas</surname><given-names>P</given-names> </name><name name-style="western"><surname>Cerny</surname><given-names>T</given-names> </name></person-group><article-title>Detecting hallucinations in large language model generation: a token probability approach</article-title><source>arXiv</source><comment>Preprint posted online on  May 30, 2024</comment><pub-id pub-id-type="doi">10.1007/978-3-031-86623-4_13</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li&#x00E9;vin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Hother</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Motzfeldt</surname><given-names>AG</given-names> </name><name name-style="western"><surname>Winther</surname><given-names>O</given-names> </name></person-group><article-title>Can large language models reason about medical questions?</article-title><source>Patterns (N Y)</source><year>2024</year><month>03</month><day>8</day><volume>5</volume><issue>3</issue><fpage>100943</fpage><pub-id pub-id-type="doi">10.1016/j.patter.2024.100943</pub-id><pub-id 
pub-id-type="medline">38487804</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Farquhar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kossen</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kuhn</surname><given-names>L</given-names> </name><name name-style="western"><surname>Gal</surname><given-names>Y</given-names> </name></person-group><article-title>Detecting hallucinations in large language models using semantic entropy</article-title><source>Nature</source><year>2024</year><month>06</month><volume>630</volume><issue>8017</issue><fpage>625</fpage><lpage>630</lpage><pub-id pub-id-type="doi">10.1038/s41586-024-07421-0</pub-id><pub-id pub-id-type="medline">38898292</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Levine</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Tuwani</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kompa</surname><given-names>B</given-names> </name><etal/></person-group><article-title>The diagnostic and triage accuracy of the GPT-3 artificial intelligence model: an observational study</article-title><source>Lancet Digit Health</source><year>2024</year><month>08</month><volume>6</volume><issue>8</issue><fpage>e555</fpage><lpage>e561</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(24)00097-9</pub-id><pub-id pub-id-type="medline">39059888</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rydzewski</surname><given-names>NR</given-names> </name><name 
name-style="western"><surname>Dinakaran</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>SG</given-names> </name><etal/></person-group><article-title>Comparative evaluation of LLMs in clinical oncology</article-title><source>NEJM AI</source><year>2024</year><month>05</month><volume>1</volume><issue>5</issue><pub-id pub-id-type="doi">10.1056/aioa2300151</pub-id><pub-id pub-id-type="medline">39131700</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xiong</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Can LLMs express their uncertainty? An empirical evaluation of confidence elicitation in LLMs</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 17, 2023</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2306.13063</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>E</given-names> </name><name name-style="western"><surname>Oufattole</surname><given-names>N</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Fang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Szolovits</surname><given-names>P</given-names> </name></person-group><article-title>What disease does this patient have? 
A large-scale open domain question answering dataset from medical exams</article-title><source>arXiv</source><comment>Preprint posted online on Sep 28, 2020</comment><pub-id pub-id-type="doi">10.48550/ARXIV.2009.13081</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Labrak</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bazoge</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dufour</surname><given-names>R</given-names> </name><etal/></person-group><article-title>FrenchMedMCQA: a French multiple-choice question answering dataset for medical domain</article-title><conf-name>Proceedings of the 13th International Workshop on Health Text Mining and Information Analysis (LOUHI)</conf-name><conf-date>Dec 7, 2022</conf-date><conf-loc>Abu Dhabi, United Arab Emirates (Hybrid)</conf-loc><fpage>41</fpage><lpage>46</lpage><pub-id pub-id-type="doi">10.18653/v1/2022.louhi-1.5</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Pal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Umapathi</surname><given-names>LK</given-names> </name><name name-style="western"><surname>Sankarasubbu</surname><given-names>M</given-names> </name></person-group><article-title>MedMCQA: a large-scale multi-subject multi-choice dataset for medical domain question answering</article-title><access-date>2024-06-18</access-date><conf-name>Proceedings of the Conference on Health, Inference, and Learning</conf-name><conf-date>Apr 2022</conf-date><fpage>248</fpage><lpage>260</lpage><comment><ext-link ext-link-type="uri" 
xlink:href="https://proceedings.mlr.press/v174/pal22a.html">https://proceedings.mlr.press/v174/pal22a.html</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Chutani</surname><given-names>G</given-names> </name></person-group><article-title>Unlocking LLM confidence through logprobs</article-title><source>Medium</source><year>2024</year><access-date>2024-06-18</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://gautam75.medium.com/unlocking-llm-confidence-through-logprobs-54b26ed1b48a">https://gautam75.medium.com/unlocking-llm-confidence-through-logprobs-54b26ed1b48a</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Huyen</surname><given-names>C</given-names> </name></person-group><article-title>Evaluation metrics for language modeling</article-title><source>The Gradient</source><year>2019</year><access-date>2025-03-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://thegradient.pub/understanding-evaluation-metrics-for-language-models/">https://thegradient.pub/understanding-evaluation-metrics-for-language-models/</ext-link></comment></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Nixon</surname><given-names>J</given-names> </name><name name-style="western"><surname>Dusenberry</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jerfel</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Measuring calibration in deep learning</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 2, 2019</comment><pub-id 
pub-id-type="doi">10.48550/ARXIV.1904.01685</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Steyerberg</surname><given-names>EW</given-names> </name><name name-style="western"><surname>Harrell</surname><given-names>FE</given-names>  <suffix>Jr</suffix></name><name name-style="western"><surname>Borsboom</surname><given-names>G</given-names> </name><name name-style="western"><surname>Eijkemans</surname><given-names>MJC</given-names> </name><name name-style="western"><surname>Vergouwe</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Habbema</surname><given-names>JDF</given-names> </name></person-group><article-title>Internal validation of predictive models</article-title><source>J Clin Epidemiol</source><year>2001</year><month>08</month><volume>54</volume><issue>8</issue><fpage>774</fpage><lpage>781</lpage><pub-id pub-id-type="doi">10.1016/S0895-4356(01)00341-9</pub-id><pub-id pub-id-type="medline">11470385</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wada</surname><given-names>A</given-names> </name><name name-style="western"><surname>Akashi</surname><given-names>T</given-names> </name><name name-style="western"><surname>Shih</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Optimizing GPT-4 Turbo diagnostic accuracy in neuroradiology through prompt engineering and confidence thresholds</article-title><source>Diagnostics (Basel)</source><year>2024</year><month>07</month><day>17</day><volume>14</volume><issue>14</issue><fpage>1541</fpage><pub-id pub-id-type="doi">10.3390/diagnostics14141541</pub-id><pub-id pub-id-type="medline">39061677</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Kufel</surname><given-names>J</given-names> </name><name name-style="western"><surname>Paszkiewicz</surname><given-names>I</given-names> </name><name name-style="western"><surname>Biel&#x00F3;wka</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Will ChatGPT pass the Polish specialty exam in radiology and diagnostic imaging? Insights into strengths and limitations</article-title><source>Pol J Radiol</source><year>2023</year><volume>88</volume><fpage>e430</fpage><lpage>e434</lpage><pub-id pub-id-type="doi">10.5114/pjr.2023.131215</pub-id><pub-id pub-id-type="medline">37808173</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schubert</surname><given-names>MC</given-names> </name><name name-style="western"><surname>Wick</surname><given-names>W</given-names> </name><name name-style="western"><surname>Venkataramani</surname><given-names>V</given-names> </name></person-group><article-title>Performance of large language models on a neurology board-style examination</article-title><source>JAMA Netw Open</source><year>2023</year><month>12</month><day>1</day><volume>6</volume><issue>12</issue><fpage>e2346721</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.46721</pub-id><pub-id pub-id-type="medline">38060223</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language 
models</article-title><source>PLOS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gertz</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Dratsch</surname><given-names>T</given-names> </name><name name-style="western"><surname>Bunck</surname><given-names>AC</given-names> </name><etal/></person-group><article-title>Potential of GPT-4 for detecting errors in radiology reports: implications for reporting accuracy</article-title><source>Radiology</source><year>2024</year><month>04</month><volume>311</volume><issue>1</issue><fpage>e232714</fpage><pub-id pub-id-type="doi">10.1148/radiol.232714</pub-id><pub-id pub-id-type="medline">38625012</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional tables.</p><media xlink:href="jmir_v27i1e64348_app1.docx" xlink:title="DOCX File, 102 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Additional figures.</p><media xlink:href="jmir_v27i1e64348_app2.docx" xlink:title="DOCX File, 10083 KB"/></supplementary-material></app-group></back></article>