<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e86498</article-id><article-id pub-id-type="doi">10.2196/86498</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Confidence Measurement Metrics in Multimodal Large Language Models for Ultrasound-Based Radiology Cases: Comparative Evaluation Study of Self-Reported, Consistency-Based, and Hybrid Methods</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Han</surname><given-names>Taewon</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Shin</surname><given-names>Jaeseung</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Lee</surname><given-names>Jeong Hyun</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Gu</surname><given-names>Kyowon</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib></contrib-group><aff id="aff1"><institution>Department of Radiology, Samsung Medical Center</institution><addr-line>81 Irwon-ro, Irwon-dong - Gangnam-gu</addr-line><addr-line>Seoul</addr-line><country>Republic of Korea</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Mitra</surname><given-names>Avijit</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Hu</surname><given-names>Danqing</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Epstein</surname><given-names>Elliot L</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Sevenster</surname><given-names>Merlijn</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Jaeseung Shin, MD, PhD, Department of Radiology, Samsung Medical Center, 81 Irwon-ro, Irwon-dong - Gangnam-gu, Seoul, 06351, Republic of Korea, 82 10-8714-7650; <email>dr.shinjs@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>2</day><month>6</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e86498</elocation-id><history><date date-type="received"><day>25</day><month>10</month><year>2025</year></date><date date-type="rev-recd"><day>15</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>18</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Taewon Han, Jaeseung Shin, Jeong Hyun Lee, Kyowon Gu. 
Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 2.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e86498"/><abstract><sec><title>Background</title><p>Large language models (LLMs) require specialized methodologies to quantify model confidence for safe deployment in health care systems; however, there is a lack of established methods for confidence assessment.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate confidence metrics for multimodal LLMs interpreting ultrasound-based radiology cases and to compare self-reported, consistency-based, and hybrid methods.</p></sec><sec sec-type="methods"><title>Methods</title><p>From a total of 330 quizzes on the Korean Society of Ultrasound in Medicine digital platform, we selected 94 multiple-choice cases. Four multimodal LLMs were evaluated: 3 reasoning models (GPT-5, Claude-4.5-Sonnet, and Gemini-3-Pro) and 1 general model (GPT-4o). Temperature was fixed at 1.0. 
Multiple confidence metrics were assessed: (1) self-reported metrics generated by LLMs using prompts that elicited direct confidence percentages with answers, including first self-reported confidence and mean self-reported confidence; (2) consistency-based metrics derived from 20 repeated outputs per case, including relative entropy calculated as 1 &#x2212; H/log<sub>2</sub> k (H=Shannon entropy, k=number of answer choices) and majority-vote percentage; and (3) a Top Weighted Score combining response frequency with self-reported confidence. Receiver operating characteristic analysis for discrimination and Spearman correlation analysis between accuracy and each confidence metric were conducted. Additionally, model calibration was assessed using expected calibration error and Brier score. Processing time and token consumption (input, output, and total) were recorded for each application programming interface call to evaluate resource use across models.</p></sec><sec sec-type="results"><title>Results</title><p>Diagnostic accuracy varied across models, with Gemini-3-Pro achieving the highest accuracy (70/94, 74.47%), surpassing the median human accuracy (59%, IQR 40.3%-75%). Top Weighted Score, a hybrid metric combining response frequency and self-reported confidence, was the only metric achieving statistically significant correlations across all 4 models: Gemini-3-Pro (&#x03C1;=0.52), GPT-5 (&#x03C1;=0.43), Claude-4.5-Sonnet (&#x03C1;=0.30), and GPT-4o (&#x03C1;=0.22). Receiver operating characteristic analysis revealed that Top Weighted Score demonstrated the highest discriminative ability, with area under the curve values of 0.826 (95% CI 0.731&#x2010;0.920) for Gemini-3-Pro and 0.767 (95% CI 0.668&#x2010;0.866) for GPT-5. Top Weighted Score was the only metric achieving statistical significance in GPT-4o. 
Calibration analysis showed that Top Weighted Score achieved the lowest expected calibration error in GPT-5 (0.098) and Claude-4.5-Sonnet (0.192), while Gemini-3-Pro showed comparable calibration between relative entropy (0.119) and Top Weighted Score (0.122). Resource use analysis demonstrated that reasoning models required substantially longer processing times and higher token consumption compared to general models.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>In multimodal LLMs applied to ultrasound-based radiology cases, hybrid methods (Top Weighted Score) demonstrated significant associations across all evaluated models and appear to serve as more reliable indicators of diagnostic confidence compared to self-reported or consistency-based metrics alone, although the strength of these associations varied across models, and external validation is warranted before broader clinical application. These findings support integrative confidence estimation approaches that incorporate response consistency while highlighting the need for resource-efficient sampling strategies to enable practical clinical deployment.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>AI</kwd><kwd>radiology</kwd><kwd>medical informatics</kwd><kwd>diagnostic confidence</kwd><kwd>large language models</kwd><kwd>LLMs</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The integration of large language models (LLMs) into clinical workflows is accelerating from promise to practice [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. These advancements necessitate robust frameworks for evaluating output reliability to ensure patient safety when LLMs are used for medical decision-making [<xref ref-type="bibr" rid="ref4">4</xref>]. 
Of particular concern is the calibration of LLMs&#x2014;the alignment between a model&#x2019;s confidence and its true accuracy&#x2014;as poorly calibrated LLMs may deliver inaccurate responses with inappropriately high confidence [<xref ref-type="bibr" rid="ref5">5</xref>], potentially introducing significant risks to patients through downstream diagnostic and treatment errors [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Unlike traditional probabilistic classifiers (eg, logistic regression or convolutional neural networks) that expose an explicit class probability for each prediction, LLMs generate text sequentially using probabilities but may present answers confidently despite substantial uncertainty in their underlying probability distributions, resulting in overconfidence issues [<xref ref-type="bibr" rid="ref4">4</xref>]. This tendency toward overconfidence complicates the safe deployment of LLMs in health care systems and underscores the need for specialized methodologies to quantify model confidence for clinical end users [<xref ref-type="bibr" rid="ref4">4</xref>].</p><p>While existing deep learning models have demonstrated established methods for uncertainty quantification in medical artificial intelligence through various approaches [<xref ref-type="bibr" rid="ref7">7</xref>-<xref ref-type="bibr" rid="ref9">9</xref>], architecturally different LLMs still lack standardized methodologies. Several approaches have been proposed to estimate the confidence of LLM outputs [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. In the self-reported method, the model is explicitly prompted to assign a numerical confidence score, typically 0% to 100%, to its own answer [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. 
Another approach uses sample consistency, leveraging the stochastic behavior of LLMs by running the same prompt multiple times and estimating confidence from the agreement or entropy of the resulting responses [<xref ref-type="bibr" rid="ref14">14</xref>]. Additional methods include directly using token-level log probabilities to quantify confidence [<xref ref-type="bibr" rid="ref15">15</xref>]. Despite these various methodologies, a best-practice standard for assessing LLM confidence has not yet been established.</p><p>Given the lack of established best practices for confidence assessment, a systematic appraisal of available techniques is essential before these systems can be deployed in clinical practice. Therefore, this study aims to evaluate confidence measurement metrics of multimodal LLMs tasked with ultrasound-based radiological cases assessing whether these approaches can serve as reliable indicators of diagnostic confidence.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>This investigation used publicly available educational datasets, obviating the need for institutional review board approval or informed consent. All quiz materials from the Korean Society of Ultrasound in Medicine (KSUM) website were previously deidentified before public release. No compensation was involved in this study, and no identifiable individuals appear in any images within the manuscript or supplementary materials.</p></sec><sec id="s2-2"><title>Dataset</title><p>A total of 330 case discussion quizzes were extracted by a radiologist (TH, with 4 years of experience) from the KSUM digital platform [<xref ref-type="bibr" rid="ref16">16</xref>], published between July 28, 2000, and December 25, 2025. The radiologist systematically collected imaging data, question content with corresponding multiple-choice options, and relevant imaging information, including imaging modality and anatomical site. 
We excluded 236 cases without multiple-choice formats to maintain measurement reliability, resulting in a final dataset of 94 quiz cases (<xref ref-type="fig" rid="figure1">Figure 1</xref>). These quiz cases encompassed various imaging modalities, with some cases featuring challenging diagnostic scenarios. To focus on multimodal capabilities, we standardized the brief clinical text to include only patient demographics (age and sex) and chief complaint or previous medical history. The ground truth for our study was established using officially designated answers from the KSUM platform. Human performance benchmarks were derived from response statistics of KSUM platform subscribers, mainly radiologists and radiology trainees with varying degrees of expertise.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart depicting the evaluation process for consistency metrics across 3 reasoning large language models and 1 general large language model. AUC: area under the curve; KSUM: Korean Society of Ultrasound in Medicine; LLM: large language model; R_H: relative entropy; ROC: receiver operating characteristic.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86498_fig01.png"/></fig></sec><sec id="s2-3"><title>Multimodal LLMs</title><p>We selected four multimodal LLMs, including three reasoning models&#x2014;(1) GPT-5 (Alias: 2025-08-07; OpenAI) [<xref ref-type="bibr" rid="ref17">17</xref>]; (2) Claude-4.5-Sonnet (Alias: 2025-09-29; Anthropic) [<xref ref-type="bibr" rid="ref18">18</xref>]; (3) Gemini-3-Pro (Alias: 2025-11-18; Google) [<xref ref-type="bibr" rid="ref19">19</xref>]&#x2014;and one general model&#x2014;GPT-4o (Alias: 2024-11-20; OpenAI) [<xref ref-type="bibr" rid="ref20">20</xref>]. 
The temperature was fixed at 1.0 across all models because the reasoning models do not permit adjustment, and previous literature reports optimal performance at this setting [<xref ref-type="bibr" rid="ref21">21</xref>]. Additionally, enhanced reasoning capabilities were activated for applicable models: &#x201C;reasoning effort&#x201D; and &#x201C;thinking level&#x201D; were set to &#x201C;high&#x201D; for GPT-5 and Gemini-3-Pro-Preview, and &#x201C;thinking&#x201D; mode was enabled for Claude-4.5-Sonnet.</p></sec><sec id="s2-4"><title>Confidence Measurement Metrics</title><p>Radiological images were paired with brief text prompts that described the clinical question and key imaging information. We evaluated self-reported, consistency-based, and hybrid confidence metrics while simultaneously measuring diagnostic accuracy (<xref ref-type="fig" rid="figure1">Figure 1</xref>). For each case, each model generated 20 independent outputs, enabling the analysis of consistency-based metrics.</p><p>First, a relative entropy&#x2013;based score (R_H) was calculated as R_H=1 &#x2212; H/log<sub>2</sub> k, where H=&#x2212;&#x2211; (<italic>i</italic>=1 to k) p<sub><italic>i</italic></sub> log<sub>2</sub> p<sub><italic>i</italic></sub> represents Shannon entropy, p<sub><italic>i</italic></sub> represents the relative frequency of option <italic>i</italic> across repeated model outputs, and k=5 is the number of answer choices (so log<sub>2</sub> k&#x2248;2.322 bits). R_H ranges from 0 (maximum entropy, complete inconsistency) to 1 (zero entropy, perfect consistency); for example, a response pattern of [A A A A A] yields R_H=1, whereas [A B C D E] yields R_H=0. Intermediate patterns receive scores reflecting their coherence; [A A A B B] would yield a higher R_H value than [A A A B C] because of greater consistency. 
The raw Shannon entropy (H) was also reported alongside R_H to provide an unnormalized measure.</p><p>Second, a majority-vote percentage recorded the proportion of the most frequent response in repeated trials. Both [A A A B B] and [A A A B C], for instance, produce a modal proportion of 60%, illustrating that this metric ignores differences in the dispersion of the remaining responses.</p><p>Third, a weighted confidence score was calculated for each answer option as follows: (frequency &#x00D7; mean self-reported confidence)/n, where frequency is the count of that option across repeated trials, mean self-reported confidence is the average confidence rating for that option, and n is the total number of repetitions. The Top Weighted Score was defined as the highest score among all options. For example, if option A appeared 12 times with a mean confidence of 80% and option B appeared 8 times with a mean confidence of 90%, the weighted scores would be (12&#x00D7;80)/20=48 for A and (8&#x00D7;90)/20=36 for B, yielding a Top Weighted Score of 48.</p><p>In parallel, each model was prompted to append a numerical self-confidence rating (0%&#x2010;100%) to each answer. Two self-reported confidence metrics were derived: (1) first self-reported confidence, which used the confidence rating from the first response and (2) mean self-reported confidence, which averaged the confidence ratings across all responses that selected the majority-vote answer. The prompt instructed models to select the diagnosis based on radiological findings and clinical context before providing the final answer in JSON format, including a confidence score on a 0%&#x2010;100% scale (eg, &#x201C;Select the diagnosis with the highest probability&#x2026; Provide your final answer in the following JSON format: answer: A-E, confidence: 0%&#x2010;100%&#x201D;). 
The exact templates are presented in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><p>To determine the minimum repetition count required for consistency-based metrics, analyses were additionally conducted with 5, 10, and 15 repeated outputs per case.</p><p>For ROC AUC, Spearman correlation, and calibration analyses, each confidence metric was paired with the diagnostic accuracy of its corresponding representative answer. For consistency-based metrics (R_H and majority-vote percentage) and mean self-reported confidence, the representative answer was the majority-voted option across repeated outputs. For the Top Weighted Score, the representative answer was the option receiving the highest weighted score. For the first self-reported confidence, the representative answer was the model&#x2019;s first response. Diagnostic accuracy was assessed by comparing the corresponding representative answer for each metric with the KSUM ground truth.</p></sec><sec id="s2-5"><title>Resource Use</title><p>To evaluate the trade-off between confidence estimation reliability and resource efficiency, we recorded the processing time and token consumption for each model query. Processing time was measured as the duration from application programming interface (API) request submission to response completion. Token usage was recorded as input tokens (text prompt and image data), output tokens (model-generated response), and total tokens consumed per query. We calculated the cumulative processing time and token consumption required to analyze a single quiz case for each repetition count condition (1, 5, 10, 15, and 20).</p></sec><sec id="s2-6"><title>Statistical Analysis</title><p>Accuracy for each model and repetition count (1, 5, 10, 15, and 20) was quantified as the proportion of correct answers. 
Differences across repetition counts were tested using the Cochran <italic>Q</italic> test [<xref ref-type="bibr" rid="ref22">22</xref>]. Discriminative ability was evaluated using receiver operating characteristic (ROC) area under the curve (AUC) with 95% CIs estimated by the DeLong method [<xref ref-type="bibr" rid="ref23">23</xref>], using each confidence metric as a predictor of diagnostic accuracy (correct vs incorrect). Spearman correlation coefficients (&#x03C1;) measured the association between each confidence measurement and diagnostic accuracy (correct vs incorrect). Correlations were interpreted as negligible (|&#x03C1;|&#x2264;0.10), weak (0.10&#x003C;|&#x03C1;|&#x2264;0.39), moderate (0.39&#x003C;|&#x03C1;|&#x2264;0.69), strong (0.69&#x003C;|&#x03C1;|&#x2264;0.89), or very strong (|&#x03C1;|&#x003E;0.89) [<xref ref-type="bibr" rid="ref24">24</xref>]. Calibration, defined as the alignment between predicted confidence and actual accuracy, was evaluated using expected calibration error (ECE) and Brier scores [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>], using fixed 10-bin calibration, with 95% CIs estimated via bootstrap resampling (1000 iterations). The Brier score was calculated based on binary diagnostic accuracy (correct vs incorrect) rather than the multiclass probability across all 5 options.</p><p>Model response repeatability was evaluated using the Fleiss &#x03BA; statistic, with results interpreted as follows: &#x003E;0.80, almost perfect; 0.61 to 0.80, substantial; 0.41 to 0.60, moderate; 0.21 to 0.40, fair; and &#x2264;0.20, poor [<xref ref-type="bibr" rid="ref27">27</xref>]. To assess potential training data contamination, majority-vote accuracy at a repetition count of 20 was compared between cases published before February 2025 (n=72) and from February to December 2025 (n=22) using Fisher exact tests. Statistical significance was established at <italic>P</italic>&#x003C;.05. 
Statistical analyses were performed using GraphPad Prism (version 10.4.1; GraphPad Software) and Python (version 3.10).</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Dataset and Diagnostic Performance</title><p>A study of 94 cases was conducted, with a median of 4 (IQR 3-4; range 1&#x2010;5) input images per case. The distribution of input image modalities comprised radiography (n=8), ultrasonography (n=94), computed tomography (n=18), magnetic resonance imaging (n=26), nuclear medicine imaging (n=6), and other diagnostic techniques, including endoscopic visualization (n=2) and aspiration fluid (n=2). Human accuracy demonstrated substantial variability (median 59%, IQR 40.3%-75%; range 5%&#x2010;96%; mean 56.8%, SD 22%).</p><p><xref ref-type="table" rid="table1">Table 1</xref> and <xref ref-type="fig" rid="figure2">Figure 2</xref> demonstrate the implementation of majority voting across repetition counts. Claude-4.5-Sonnet showed a significant improvement in diagnostic accuracy with majority voting, improving from 48.94% (46/94) to 55.32% (52/94) at 10 repetitions (<italic>P</italic>=.01). 
In contrast, Gemini-3-Pro, GPT-5, and GPT-4o showed no significant change with majority voting (<italic>P</italic>=.67, <italic>P</italic>=.94, and <italic>P</italic>=.08, respectively), with Gemini-3-Pro showing the highest accuracy ranging from 72.34% (68/94) to 74.47% (70/94).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Comparison of first output and majority vote accuracy across multimodal large language models (N=94).<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Model</td><td align="left" valign="top">First output, n (%)</td><td align="left" valign="top">Majority vote (5), n (%)</td><td align="left" valign="top">Majority vote (10), n (%)</td><td align="left" valign="top">Majority vote (15), n (%)</td><td align="left" valign="top">Majority vote (20), n (%)</td><td align="left" valign="top"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Claude-4.5-Sonnet</td><td align="left" valign="top">46 (48.94)</td><td align="left" valign="top">49 (52.13)</td><td align="left" valign="top">52 (55.32)</td><td align="left" valign="top">52 (55.32)</td><td align="left" valign="top">52 (55.32)</td><td align="left" valign="top">.01</td></tr><tr><td align="left" valign="top">Gemini-3-Pro</td><td align="left" valign="top">70 (74.47)</td><td align="left" valign="top">70 (74.47)</td><td align="left" valign="top">69 (73.40)</td><td align="left" valign="top">68 (72.34)</td><td align="left" valign="top">68 (72.34)</td><td align="left" valign="top">.67</td></tr><tr><td align="left" valign="top">GPT-5</td><td align="left" valign="top">63 (67.02)</td><td align="left" valign="top">64 (68.09)</td><td align="left" valign="top">65 (69.15)</td><td align="left" valign="top">65 (69.15)</td><td align="left" valign="top">64 (68.09)</td><td align="left" valign="top">.94</td></tr><tr><td align="left" 
valign="top">GPT-4o</td><td align="left" valign="top">41 (43.62)</td><td align="left" valign="top">44 (46.81)</td><td align="left" valign="top">43 (45.74)</td><td align="left" valign="top">44 (46.81)</td><td align="left" valign="top">44 (46.81)</td><td align="left" valign="top">.08</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Differences in accuracy were assessed using the Cochran <italic>Q</italic> test. Human accuracy for these cases showed substantial variability (median 59%, IQR 40.3%-75%; mean 56.8%, SD 22%, range 5%-96%).</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Accuracy and &#x03BA; values plotted against repetition number.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86498_fig02.png"/></fig><p>Fleiss &#x03BA; analysis demonstrated substantial within-model repeatability across all models (Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Gemini-3-Pro achieved the highest consistency (&#x03BA;=0.79&#x2010;0.80), followed by GPT-4o (&#x03BA;=0.77&#x2010;0.79). Claude-4.5-Sonnet and GPT-5 showed comparable repeatability (&#x03BA;=0.73&#x2010;0.76 and &#x03BA;=0.73&#x2010;0.75, respectively). 
All models maintained substantial agreement across all repetition counts, with response consistency remaining stable from 5 to 20 repetitions (<xref ref-type="fig" rid="figure2">Figure 2</xref>).</p><p>No statistically significant differences in majority-vote accuracy were observed between cases published before and after February 2025 for any of the 4 evaluated models (all <italic>P</italic>&#x003E;.79; Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s3-2"><title>Confidence Measurement Metrics</title><p>The discriminative ability of confidence metrics varied substantially across models (<xref ref-type="table" rid="table2">Table 2</xref>). Top Weighted Score demonstrated the highest discriminative performance, achieving significant ROC AUC values across all models. Gemini-3-Pro showed the strongest discrimination with Top Weighted Score (ROC AUC=0.826, 95% CI 0.731&#x2010;0.920, <italic>P</italic>&#x003C;.001), followed by GPT-5 (ROC AUC=0.767, 95% CI 0.668&#x2010;0.866, <italic>P</italic>&#x003C;.001), Claude-4.5-Sonnet (ROC AUC=0.676, 95% CI 0.568&#x2010;0.785, <italic>P</italic>=.001), and GPT-4o (ROC AUC=0.629, 95% CI 0.509&#x2010;0.749, <italic>P</italic>=.04). 
Notably, Top Weighted Score was the only metric achieving statistical significance in GPT-4o.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Receiver operating characteristic area under the curve comparing discriminative ability of confidence metrics in multimodal large language models.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Self-reported (first) (ROC<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup> AUC<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup>, 95% CI, <italic>P</italic> value<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup>)</td><td align="left" valign="bottom">Self-reported (mean) (ROC AUC, 95% CI, <italic>P</italic> value)</td><td align="left" valign="bottom">R_H<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup> (ROC AUC, 95% CI, <italic>P</italic> value)<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="bottom">Majority-vote percentage (ROC AUC, 95% CI, <italic>P</italic> value)</td><td align="left" valign="bottom">Top Weighted Score (ROC AUC, 95% CI, <italic>P</italic> value)</td></tr></thead><tbody><tr><td align="left" valign="top">Claude-4.5-Sonnet</td><td align="left" valign="top">0.706, 0.602&#x2010;0.810, &#x003C;.001</td><td align="left" valign="top">0.636, 0.523&#x2010;0.748, .02</td><td align="left" valign="top">0.671, 0.565&#x2010;0.778, .002</td><td align="left" valign="top">0.668, 0.562&#x2010;0.775, .002</td><td align="left" valign="top">0.676, 0.568&#x2010;0.785, .001</td></tr><tr><td align="left" valign="top">Gemini-3-Pro</td><td align="left" valign="top">0.532, 0.439&#x2010;0.625, .50</td><td align="left" valign="top">0.661, 0.546&#x2010;0.775, .006</td><td align="left" valign="top">0.779, 0.672&#x2010;0.887, &#x003C;.001</td><td align="left" valign="top">0.790, 0.682&#x2010;0.897, &#x003C;.001</td><td align="left" 
valign="top">0.826, 0.731&#x2010;0.920, &#x003C;.001</td></tr><tr><td align="left" valign="top">GPT-5</td><td align="left" valign="top">0.719, 0.613&#x2010;0.826, &#x003C;.001</td><td align="left" valign="top">0.659, 0.547&#x2010;0.771, .005</td><td align="left" valign="top">0.755, 0.647&#x2010;0.863, &#x003C;.001</td><td align="left" valign="top">0.740, 0.631&#x2010;0.848, &#x003C;.001</td><td align="left" valign="top">0.767, 0.668&#x2010;0.866, &#x003C;.001</td></tr><tr><td align="left" valign="top">GPT-4o</td><td align="left" valign="top">0.597, 0.491&#x2010;0.703, .07</td><td align="left" valign="top">0.592, 0.476&#x2010;0.708, .12</td><td align="left" valign="top">0.576, 0.463&#x2010;0.689, .19</td><td align="left" valign="top">0.577, 0.464&#x2010;0.689, .18</td><td align="left" valign="top">0.629, 0.509&#x2010;0.749, .04</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>ROC: receiver operating characteristic.</p></fn><fn id="table2fn2"><p><sup>b</sup>AUC: area under the curve.</p></fn><fn id="table2fn3"><p><sup>c</sup>Statistically significant results (<italic>P</italic>&#x003C;.05) are marked with an asterisk.</p></fn><fn id="table2fn4"><p><sup>d</sup>R_H: relative entropy.</p></fn><fn id="table2fn5"><p><sup>e</sup>Shannon entropy is not reported separately because it yielded identical AUC values to the relative entropy&#x2013;based score.</p></fn></table-wrap-foot></table-wrap><p>Consistency-based metrics (R_H, majority-vote percentage, and Shannon entropy) showed strong discriminative ability in 3 models. 
Gemini-3-Pro achieved the highest performance (R_H, ROC AUC=0.779, 95% CI 0.672&#x2010;0.887, <italic>P</italic>&#x003C;.001; majority-vote percentage, ROC AUC=0.790, 95% CI 0.682&#x2010;0.897, <italic>P</italic>&#x003C;.001; Shannon entropy, ROC AUC=0.779, 95% CI 0.672&#x2010;0.887, <italic>P</italic>&#x003C;.001), followed by GPT-5 (R_H, ROC AUC=0.755, 95% CI 0.647&#x2010;0.863, <italic>P</italic>&#x003C;.001; majority-vote percentage, ROC AUC=0.740, 95% CI 0.631&#x2010;0.848, <italic>P</italic>&#x003C;.001; Shannon entropy, ROC AUC=0.755, 95% CI 0.647&#x2010;0.863, <italic>P</italic>&#x003C;.001) and Claude-4.5-Sonnet (R_H, ROC AUC=0.671, 95% CI 0.565&#x2010;0.778, <italic>P</italic>=.002; majority-vote percentage, ROC AUC=0.668, 95% CI 0.562&#x2010;0.775, <italic>P</italic>=.002; Shannon entropy, ROC AUC=0.671, 95% CI 0.565&#x2010;0.778, <italic>P</italic>=.002). However, GPT-4o showed no significant discrimination with consistency-based metrics (R_H, ROC AUC=0.576, <italic>P</italic>=.19; majority-vote percentage, ROC AUC=0.577, <italic>P</italic>=.18; Shannon entropy, ROC AUC=0.576, <italic>P</italic>=.19).</p><p>Self-reported confidence demonstrated model-dependent performance. First self-reported confidence achieved significant discrimination in Claude-4.5-Sonnet (ROC AUC=0.706, 95% CI 0.602&#x2010;0.810, <italic>P</italic>&#x003C;.001) and GPT-5 (ROC AUC=0.719, 95% CI 0.613&#x2010;0.826, <italic>P</italic>&#x003C;.001). However, mean self-reported confidence showed lower discriminative ability in Claude-4.5-Sonnet (ROC AUC=0.636, <italic>P</italic>=.02), GPT-5 (ROC AUC=0.659, <italic>P</italic>=.005), and Gemini-3-Pro (ROC AUC=0.661, <italic>P</italic>=.006).</p><p><xref ref-type="fig" rid="figure3">Figure 3</xref> replicates the ROC findings. 
Top Weighted Score demonstrated higher confidence scores for correct responses across all 4 models, whereas R_H, majority-vote percentage, first self-reported confidence, and mean self-reported confidence showed this pattern only in GPT-5, Claude-4.5-Sonnet, and Gemini-3-Pro, excluding GPT-4o.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Distribution of confidence scores stratified by diagnostic accuracy (correct vs incorrect) for each multimodal large language model. (A) Relative entropy (R_H), (B) majority-vote percentage, (C) first self-reported confidence, (D) mean self-reported confidence, and (E) Top Weighted Score. Solid bars indicate correct responses; hatched bars indicate incorrect responses. Differences between groups were assessed using the Mann-Whitney <italic>U</italic> test. *<italic>P</italic>&#x003C;.05; **<italic>P</italic>&#x003C;.01; ***<italic>P</italic>&#x003C;.001.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86498_fig03.png"/></fig><p>Spearman correlation analysis revealed similar patterns (<xref ref-type="table" rid="table3">Table 3</xref>). Notably, Top Weighted Score was the only metric achieving statistical significance across all 4 models, demonstrating moderate correlations in Gemini-3-Pro (&#x03C1;=0.52, 95% CI 0.35&#x2010;0.65, <italic>P</italic>&#x003C;.001) and GPT-5 (&#x03C1;=0.43, 95% CI 0.25&#x2010;0.58, <italic>P</italic>&#x003C;.001), and weak correlations in Claude-4.5-Sonnet (&#x03C1;=0.30, 95% CI 0.11&#x2010;0.48, <italic>P</italic>=.003) and GPT-4o (&#x03C1;=0.22, 95% CI 0.02&#x2010;0.41, <italic>P</italic>=.03). In GPT-4o, Top Weighted Score was the only metric achieving significance. 
Consistency-based metrics showed moderate correlations in Gemini-3-Pro (R_H, &#x03C1;=0.48; majority-vote percentage, &#x03C1;=0.50; all <italic>P</italic>&#x003C;.001) and GPT-5 (R_H, &#x03C1;=0.43; majority-vote percentage, &#x03C1;=0.41; all <italic>P</italic>&#x003C;.001). Self-reported confidence showed weak correlations across all models.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Correlation analysis between accuracy and confidence metrics in multimodal large language models at a repetition count of 20.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model and metrics</td><td align="left" valign="bottom">&#x03C1; (95% CI)<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Claude-4.5-Sonnet</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (first)</td><td align="left" valign="top">0.36 (0.17 to 0.53)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (mean)</td><td align="left" valign="top">0.23 (0.03 to 0.42)</td><td align="left" valign="top">.02</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>R_H<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup></td><td align="left" valign="top">0.31 (0.11 to 0.48)</td><td align="left" valign="top">.002</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Majority-vote percentage</td><td align="left" valign="top">0.30 (0.11 to 0.48)</td><td 
align="left" valign="top">.003</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Top Weighted Score</td><td align="left" valign="top">0.30 (0.11 to 0.48)</td><td align="left" valign="top">.003</td></tr><tr><td align="left" valign="top" colspan="3">Gemini-3-Pro</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (first)</td><td align="left" valign="top">0.06 (&#x2212;0.14 to 0.26)</td><td align="left" valign="top">.55</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (mean)</td><td align="left" valign="top">0.25 (0.05 to 0.43)</td><td align="left" valign="top">.014</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>R_H</td><td align="left" valign="top">0.48 (0.30 to 0.62)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Majority-vote percentage</td><td align="left" valign="top">0.50 (0.33 to 0.64)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Top Weighted Score</td><td align="left" valign="top">0.52 (0.35 to 0.65)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="3">GPT-5</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (first)</td><td align="left" valign="top">0.36 (0.17 to 0.52)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (mean)</td><td align="left" valign="top">0.26 (0.06 to 0.44)</td><td align="left" valign="top">.012</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>R_H</td><td align="left" valign="top">0.43 (0.25 to 0.58)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Majority-vote percentage</td><td align="left" valign="top">0.41 (0.22 to 0.56)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Top Weighted Score</td><td align="left" valign="top">0.43 (0.25 to 0.58)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top" colspan="3">GPT-4o</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (first)</td><td align="left" valign="top">0.18 (&#x2212;0.02 to 0.37)</td><td align="left" valign="top">.08</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (mean)</td><td align="left" valign="top">0.16 (&#x2212;0.05 to 0.35)</td><td align="left" valign="top">.13</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>R_H</td><td align="left" valign="top">0.14 (&#x2212;0.07 to 0.33)</td><td align="left" valign="top">.18</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Majority-vote percentage</td><td align="left" valign="top">0.14 (&#x2212;0.06 to 0.33)</td><td align="left" valign="top">.18</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Top Weighted Score</td><td align="left" valign="top">0.22 (0.02 to 0.41)</td><td align="left" valign="top">.03</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Values represent Spearman correlation coefficients (&#x03C1;) with 95% CIs in parentheses. </p></fn><fn id="table3fn2"><p><sup>b</sup>Significant correlations (<italic>P</italic>&#x003C;.05) are marked with an asterisk.</p></fn><fn id="table3fn3"><p><sup>c</sup>R_H: relative entropy.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>Calibration</title><p>Calibration analysis revealed that the Top Weighted Score demonstrated the best calibration in GPT-5 (ECE=0.098, 95% CI 0.074&#x2010;0.211; Brier score=0.185, 95% CI 0.140&#x2010;0.235) and Claude-4.5-Sonnet (ECE=0.192, 95% CI 0.133&#x2010;0.307; Brier score=0.259, 95% CI 0.203&#x2010;0.317). In Gemini-3-Pro, R_H and the Top Weighted Score showed comparable calibration (ECE=0.119 vs 0.122; Brier score=0.164 vs 0.163, respectively). Across Claude-4.5-Sonnet, Gemini-3-Pro, and GPT-5, consistency-based metrics, particularly R_H, demonstrated better calibration compared to self-reported metrics. 
GPT-4o showed the poorest calibration across all metrics (<xref ref-type="table" rid="table4">Table 4</xref>).</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Calibration metrics for different confidence measurement methods in multimodal large language models at a repetition count of 20.<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup></p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model and metrics</td><td align="left" valign="bottom">ECE<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup> (95% CI)</td><td align="left" valign="bottom">Brier score (95% CI)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Claude-4.5-Sonnet</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (mean)</td><td align="left" valign="top">0.284 (0.195&#x2010;0.389)</td><td align="left" valign="top">0.317 (0.253&#x2010;0.385)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (first)</td><td align="left" valign="top">0.340 (0.244&#x2010;0.439)</td><td align="left" valign="top">0.339 (0.275&#x2010;0.409)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>R_H<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td><td align="left" valign="top">0.266 (0.191&#x2010;0.375)</td><td align="left" valign="top">0.288 (0.216&#x2010;0.363)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Majority-vote percentage</td><td align="left" valign="top">0.304 (0.226&#x2010;0.403)</td><td align="left" valign="top">0.321 (0.247&#x2010;0.399)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Top Weighted Score</td><td align="left" valign="top">0.192 (0.133&#x2010;0.307)</td><td align="left" valign="top">0.259 (0.203&#x2010;0.317)</td></tr><tr><td align="left" valign="top" colspan="3">Gemini-3-Pro</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (mean)</td><td align="left" valign="top">0.216 (0.128&#x2010;0.305)</td><td align="left" valign="top">0.243 (0.165&#x2010;0.322)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (first)</td><td align="left" valign="top">0.208 (0.120&#x2010;0.298)</td><td align="left" valign="top">0.229 (0.152&#x2010;0.313)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>R_H</td><td align="left" valign="top">0.119 (0.078&#x2010;0.210)</td><td align="left" valign="top">0.164 (0.109&#x2010;0.226)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Majority-vote percentage</td><td align="left" valign="top">0.168 (0.109&#x2010;0.264)</td><td align="left" valign="top">0.178 (0.117&#x2010;0.244)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Top Weighted Score</td><td align="left" valign="top">0.122 (0.071&#x2010;0.212)</td><td align="left" valign="top">0.163 (0.109&#x2010;0.220)</td></tr><tr><td align="left" valign="top" colspan="3">GPT-5</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (mean)</td><td align="left" valign="top">0.172 (0.097&#x2010;0.273)</td><td align="left" valign="top">0.235 (0.171&#x2010;0.303)</td></tr><tr><td align="left" 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (first)</td><td align="left" valign="top">0.176 (0.096&#x2010;0.278)</td><td align="left" valign="top">0.230 (0.164&#x2010;0.296)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>R_H</td><td align="left" valign="top">0.140 (0.099&#x2010;0.240)</td><td align="left" valign="top">0.191 (0.135&#x2010;0.252)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Majority-vote percentage</td><td align="left" valign="top">0.206 (0.130&#x2010;0.299)</td><td align="left" valign="top">0.219 (0.154&#x2010;0.287)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Top Weighted Score</td><td align="left" valign="top">0.098 (0.074&#x2010;0.211)</td><td align="left" valign="top">0.185 (0.140&#x2010;0.235)</td></tr><tr><td align="left" valign="top" colspan="3">GPT-4o</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (mean)</td><td align="left" valign="top">0.436 (0.329&#x2010;0.543)</td><td align="left" valign="top">0.436 (0.349&#x2010;0.523)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Self-reported (first)</td><td align="left" valign="top">0.468 (0.368&#x2010;0.570)</td><td align="left" valign="top">0.459 (0.380&#x2010;0.542)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>R_H</td><td align="left" valign="top">0.377 (0.303&#x2010;0.494)</td><td align="left" valign="top">0.397 (0.313&#x2010;0.482)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Majority-vote percentage</td><td align="left" valign="top">0.413 (0.332&#x2010;0.535)</td><td align="left" valign="top">0.435 (0.345&#x2010;0.525)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Top Weighted Score</td><td align="left" valign="top">0.356 (0.272&#x2010;0.470)</td><td align="left" valign="top">0.368 (0.295&#x2010;0.440)</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Lower values indicate better calibration. ECE was calculated using 10 bins.</p></fn><fn id="table4fn2"><p><sup>b</sup>ECE: expected calibration error.</p></fn><fn id="table4fn3"><p><sup>c</sup>R_H: relative entropy.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Repetition Count Analysis</title><p>Correlation analysis stratified by repetition count revealed model-specific patterns in the relationship between consistency-based metrics and diagnostic accuracy (Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Gemini-3-Pro showed strengthening correlations with increasing repetitions, with majority-vote percentage and R_H improving from weak (&#x03C1;=0.35, <italic>P</italic>&#x003C;.001) at 5 repetitions to moderate (&#x03C1;=0.51&#x2010;0.52, <italic>P</italic>&#x003C;.001) at 15 repetitions, and Top Weighted Score improving from weak (&#x03C1;=0.32, <italic>P</italic>=.002) to moderate (&#x03C1;=0.46, <italic>P</italic>&#x003C;.001). Mean self-reported confidence in Gemini-3-Pro achieved statistical significance only after 10 repetitions. In contrast, Claude-4.5-Sonnet demonstrated significant weak correlations across all metrics after 5 repetitions. 
GPT-5 showed improvement in R_H from weak (&#x03C1;=0.38, <italic>P</italic>&#x003C;.001) at 5 repetitions to moderate (&#x03C1;=0.41, <italic>P</italic>&#x003C;.001) at 15 repetitions, while Top Weighted Score maintained moderate correlations across all repetition counts (&#x03C1;=0.40&#x2010;0.51; all <italic>P</italic>&#x003C;.001). GPT-4o showed nonsignificant correlations across all metrics regardless of repetition count, except for Top Weighted Score at 20 repetitions (&#x03C1;=0.22, <italic>P</italic>=.03; <xref ref-type="fig" rid="figure4">Figure 4</xref>).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Heatmap of Spearman correlation (&#x03C1;) and calibration (ECE) between diagnostic accuracy and confidence metrics across multimodal large language models by repetition count. ECE: expected calibration error; R_H: relative entropy.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e86498_fig04.png"/></fig><p>Calibration metrics showed similar patterns with increasing repetitions (Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Top Weighted Score demonstrated decreasing ECE values across repetitions in Gemini-3-Pro (0.158 to 0.112), GPT-5 (0.118 to 0.095), and Claude-4.5-Sonnet (0.230 to 0.184).</p><p>Heatmaps display Spearman correlation coefficients (&#x03C1;; top row, panels A-D) and ECE (bottom row, panels E-H) for 4 confidence metrics across repetition counts of 5, 10, 15, and 20. For correlation, darker shading indicates a stronger positive correlation (range 0&#x2010;1); for ECE, darker shading indicates better calibration with lower error (range 0&#x2010;0.5). Statistically significant correlations (<italic>P</italic>&#x003C;.05) are marked with an asterisk. 
Shannon entropy exhibited inverse correlations with identical magnitudes to R_H values (data not shown).</p></sec><sec id="s3-5"><title>Resource Use</title><p>Resource consumption varied substantially across models (Table S6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). GPT-4o demonstrated the highest efficiency, requiring a mean processing time of 6.38 (SD 2.26) seconds and a mean of 4353 (SD 1402) total tokens per case at 1 repetition, increasing to 59.43 (SD 12.46) seconds and 43,423 (SD 13,921) tokens at 10 repetitions, and 117.93 (SD 23.44) seconds and 86,877 (SD 27,869) tokens at 20 repetitions. Claude-4.5-Sonnet required a mean processing time of 29.48 (SD 6.37) seconds and a mean of 5551 (SD 1477) total tokens at 1 repetition, increasing to 288.27 (SD 44.76) seconds and 55,207 (SD 14,505) tokens at 10 repetitions, and 750.43 (SD 1694.80) seconds and 110,344 (SD 28,860) tokens at 20 repetitions. Gemini-3-Pro consumed a mean processing time of 54.75 (SD 29.79) seconds and a mean of 8774 (SD 2993) total tokens at 1 repetition, increasing to 543.87 (SD 256.35) seconds and 88,621 (SD 26,743) tokens at 10 repetitions, and 1103.97 (SD 528.83) seconds and 178,114 (SD 54,325) tokens at 20 repetitions. GPT-5 showed the highest resource consumption, requiring a mean processing time of 70.66 (SD 39.39) seconds and a mean of 6348 (SD 2026) total tokens at 1 repetition, increasing to 947.53 (SD 1808.41) seconds and 63,956 (SD 17,991) tokens at 10 repetitions, and 1768.43 (SD 2059.84) seconds and 127,879 (SD 35,210) tokens at 20 repetitions. 
Notably, sporadic extreme processing times exceeding 500 seconds per individual API call were observed for Claude-4.5-Sonnet (2/1880, 0.11% of calls) and GPT-5 (5/1880, 0.27% of calls), whereas no such events occurred in Gemini-3-Pro or GPT-4o.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our findings indicate that the Top Weighted Score, a composite metric integrating response consistency with self-reported confidence, provided the most consistent assessment of multimodal LLM output reliability for ultrasound-based radiological cases. Notably, it was the only metric to demonstrate statistically significant correlations across all 4 evaluated models, and it exhibited the best calibration in most models. In parallel, consistency-based metrics (R_H, majority-vote percentage, and Shannon entropy) showed strong discriminative performance in Gemini-3-Pro, GPT-5, and Claude-4.5-Sonnet when contrasted with self-reported confidence, underscoring the value of response agreement&#x2013;derived signals for reliability estimation.</p></sec><sec id="s4-2"><title>Comparison to Prior Work</title><p>These observations align with prior studies suggesting that consistency-based calibration approaches can outperform post hoc verbalized confidence methods for estimating LLM uncertainty and reliability [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. In radiology, Huppertz et al [<xref ref-type="bibr" rid="ref30">30</xref>] similarly reported no significant association between verbalized confidence and diagnostic accuracy, with accuracy remaining below 50% even at the highest confidence scores. However, in our study, first-response confidence showed relatively high ROC AUC values in GPT-5 and Claude-4.5-Sonnet, whereas mean self-reported confidence demonstrated lower values; notably, Gemini-3-Pro showed the opposite pattern. 
This inconsistency suggests that the apparent first-response advantage may have been driven by sampling variability. Accordingly, averaging verbalized confidence scores across repetitions aggregates values that are only partially aligned with actual correctness rather than producing a more accurate probability estimate [<xref ref-type="bibr" rid="ref4">4</xref>], and this process may compress case-level variance and reduce discriminative ability. Given the intrinsically stochastic nature of LLM generation, reliance on a single initial output for confidence estimation poses significant risks in clinical settings.</p><p>Interestingly, the entropy-based metrics achieved lower ECE and Brier scores than the simple majority-vote percentage metric. This suggests that model response dispersion, rather than only the most frequent response, yields better-calibrated confidence estimates. Given the high &#x03BA; values observed in our study, the benefit of entropy-based metrics is likely to increase in settings with greater response diversity, such as tasks with larger multiple-choice panels or free-text outputs. However, high interresponse agreement (&#x03BA;&#x2248;0.7&#x2010;0.8) and fixed sampling parameters (eg, temperature=1) may inflate confidence and reduce discriminative power.</p><p>Additionally, although not statistically significant, Gemini-3-Pro, the highest-performing model, showed a decrease in accuracy with majority voting at 15 and 20 repetitions. This may partly reflect the systematic nature of its errors across repetitions. 
When interresponse agreement is high (&#x03BA;=0.79&#x2010;0.80), incorrect responses tend to be consistent across repetitions, which may limit the corrective potential of majority voting and contribute to the marginal decrease in accuracy observed.</p><p>Despite broad improvements in capability across successive model releases, recent studies across domains indicate that newer models can retain a systematic tendency toward overconfidence [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. Consistent with this literature, we observed that models frequently assigned high confidence to incorrect answers, which poses a direct challenge for deployment in clinical decision support, where confidently presented errors may be disproportionately persuasive and propagate into downstream decision-making [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. Several recent studies have reported that hybrid approaches integrating consistency and verbalized confidence can outperform either method alone in certain models [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. In our study, Top Weighted Score operationalizes a related principle by weighting candidate responses using both frequency and self-reported confidence, potentially mitigating 2 complementary limitations: (1) overconfidence inherent to verbalized estimates and (2) reduced sensitivity of pure consistency metrics when interresponse agreement is high. 
The consistent performance of Top Weighted Score across all tested models supports the utility of such integrative formulations for multimodal radiology tasks.</p><p>A major practical limitation of consistency-based estimation is resource intensity, as multiple independent model executions are required to compute agreement and dispersion metrics, increasing API costs and processing time [<xref ref-type="bibr" rid="ref11">11</xref>]. In our study, processing time varied substantially by model and repetition count, with the burden particularly evident for reasoning models, where extended reasoning traces can materially increase latency and necessitate explicit cost-benefit trade-offs. To mitigate this overhead, adaptive sampling strategies have been proposed that achieve comparable accuracy with significantly reduced computational costs [<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. In our repetition-depth analysis, approximately 10 resampling runs per case were sufficient to stabilize discrimination and calibration estimates. Further research is needed to establish minimal sampling schedules for each model, along with inference acceleration strategies leveraging recent advances [<xref ref-type="bibr" rid="ref37">37</xref>], to balance confidence estimation reliability with resource efficiency. Furthermore, sporadic extreme processing times were observed in Claude-4.5-Sonnet and GPT-5, likely reflecting potential API latency or timeout events. Although this processing time instability affected fewer than 0.3% of total API calls, it disproportionately inflated processing time variability and may represent a real-world barrier to clinical deployment, where predictable response times are essential.</p></sec><sec id="s4-3"><title>Limitations</title><p>This study exhibited several limitations. 
First, our relatively small sample size (N=94) potentially restricts the generalizability of our findings, and the fixed 10-bin ECE may be unstable in this sample, as confidence score clustering may leave some bins sparsely populated. Second, because model performance was assessed with multiple-choice questions, the evaluation does not fully represent real-world clinical scenarios that typically require free-text responses. Third, our evaluation was limited to the most widely available closed-source models and did not include open-source models that might have high potential applicability in health care. Fourth, as the KSUM educational quizzes are publicly accessible online, there is a possibility that these materials may have been included in the training corpora of the evaluated LLMs. This potential data contamination could inflate model accuracy estimates. However, a temporal holdout analysis revealed no significant accuracy differences between precutoff and postcutoff cases for any model (Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Furthermore, our previous investigation using the same dataset demonstrated no significant performance differences for GPT-4o between cases published before versus after the model&#x2019;s knowledge cutoff date [<xref ref-type="bibr" rid="ref38">38</xref>], suggesting that even if contamination occurred, its impact on model performance may be negligible given the vast scale of training parameters. Fifth, a practical limitation of consistency-based estimation is resource intensity, as multiple independent model executions are required, increasing costs and processing time. To address this, we conducted our repetition count analysis, and approximately 10 resampling runs per case appeared to be sufficient to stabilize discrimination and calibration estimates. 
Sixth, although processing times indicated that reasoning models engaged in extended computation, the structured JSON output requirement may have partially constrained the models&#x2019; chain-of-thought process, potentially affecting confidence calibration. Additionally, our prompt was designed such that confidence values were embedded as natural language strings within the JSON output rather than as direct numerical fields, which may introduce parsing instability if models generate slight textual variations. Seventh, our evaluation used general-purpose multimodal LLMs. Although recent studies have evaluated general-purpose multimodal LLMs on radiological cases, these investigations predominantly focus on diagnostic accuracy [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. The development of radiologic domain-specific multimodal LLMs is in its early stages [<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref41">41</xref>], and such models are not yet widely available commercially. Building upon these studies of general-purpose LLMs, our findings may contribute to the future development and evaluation of domain-specific expert models. Finally, the Top Weighted Score was developed post hoc on the study dataset, which introduces a potential risk of overfitting. Therefore, external validation using independent datasets is necessary to confirm the generalizability of this hybrid metric.</p></sec><sec id="s4-4"><title>Future Directions</title><p>Several directions for future research emerge from our findings. First, validation with larger datasets across various radiological conditions would strengthen generalizability and potentially identify additional patterns in model performance and calibration. Second, incorporating free-text tasks would better approximate real-world clinical usage. 
Third, comparative studies should explore whether open-source models demonstrate similar confidence calibration patterns and assess their applicability in clinical environments. Fourth, further research is needed to establish minimal sampling schedules and develop adaptive sampling strategies to balance confidence estimation reliability with resource efficiency. Additionally, studies using models with accessible internal reasoning processes could enable direct entropy computation from log probabilities, potentially reducing computational overhead while maintaining calibration accuracy. Fifth, comparative studies evaluating human reader confidence alongside LLM confidence metrics during diagnostic tasks could provide valuable insights into the potential for LLMs to augment clinical decision-making. Finally, from a clinical perspective, confidence metrics could augment human decision-making by providing reliability indicators for LLM outputs; however, clinical validation studies are needed to evaluate the practical use of these metrics in real-world diagnostic settings.</p></sec><sec id="s4-5"><title>Conclusions</title><p>In multimodal LLMs applied to ultrasound-based radiology cases, hybrid methods (Top Weighted Score) demonstrated significant associations across all evaluated models and appear to serve as more reliable indicators of diagnostic confidence compared to self-reported or consistency-based metrics alone, although the strength of this association varied across models, and external validation is warranted before broader clinical application.</p></sec></sec></body><back><ack><p>The authors used Claude-4.5-Sonnet (Anthropic) for language editing and grammar checking during manuscript preparation. All content was verified and approved by the authors. 
The large language models evaluated in this study (GPT-4o, GPT-5, Claude-4.5-Sonnet, and Gemini-3-Pro) are described in the Methods section.</p></ack><notes><sec><title>Funding</title><p>This work was supported by a grant from the National Research Foundation of Korea (NRF), funded by the Korean government (MSIT) (grant RS-2025-00516874).</p></sec><sec><title>Data Availability</title><p>The research data supporting the findings of this study are publicly available through the Korean Society of Ultrasound in Medicine (KSUM) [<xref ref-type="bibr" rid="ref16">16</xref>]. This study used these open-source datasets for analysis. The underlying code for this study, including calibration analysis, is provided in Supplementary Material 1 in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> and will be made publicly available upon publication.</p></sec></notes><fn-group><fn fn-type="con"><p>TH and JS conceived and designed the study. TH, JHL, and KG acquired the data. TH, JHL, KG, and JS analyzed and interpreted the data. TH and JS prepared the first draft of the manuscript. 
All authors critically revised the manuscript for important intellectual content, approved the final manuscript, and agreed to be accountable for all aspects of the work.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">API</term><def><p>application programming interface</p></def></def-item><def-item><term id="abb2">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb3">ECE</term><def><p>expected calibration error</p></def></def-item><def-item><term id="abb4">KSUM</term><def><p>Korean Society of Ultrasound in Medicine</p></def></def-item><def-item><term id="abb5">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb6">R_H</term><def><p>relative entropy</p></def></def-item><def-item><term id="abb7">ROC</term><def><p>receiver operating characteristic</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah</surname><given-names>NH</given-names> </name><name name-style="western"><surname>Entwistle</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pfeffer</surname><given-names>MA</given-names> </name></person-group><article-title>Creation and adoption of large language models in medicine</article-title><source>JAMA</source><year>2023</year><month>09</month><day>5</day><volume>330</volume><issue>9</issue><fpage>866</fpage><lpage>869</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.14217</pub-id><pub-id pub-id-type="medline">37548965</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name 
name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kennedy</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Evaluating GPT-v4 (GPT-4 with vision) on detection of radiologic findings on chest radiographs</article-title><source>Radiology</source><year>2024</year><month>05</month><volume>311</volume><issue>2</issue><fpage>e233270</fpage><pub-id pub-id-type="doi">10.1148/radiol.233270</pub-id><pub-id pub-id-type="medline">38713028</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Xiong</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Fu</surname><given-names>J</given-names> </name><name name-style="western"><surname>He</surname><given-names>J</given-names> </name></person-group><article-title>Can LLMs express their uncertainty? An empirical evaluation of confidence elicitation in LLMs</article-title><access-date>2026-05-22</access-date><conf-name>12th International Conference on Learning Representations, ICLR 2024</conf-name><conf-date>May 7-11, 2024</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://researchportal.hkust.edu.hk/en/publications/can-llms-express-their-uncertainty-an-empirical-evaluation-of-con/?">https://researchportal.hkust.edu.hk/en/publications/can-llms-express-their-uncertainty-an-empirical-evaluation-of-con/?</ext-link></comment></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>C</given-names> </name><name name-style="western"><surname>Pleiss</surname><given-names>G</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Weinberger</surname><given-names>KQ</given-names> </name></person-group><article-title>On calibration of modern neural networks</article-title><access-date>2026-05-22</access-date><conf-name>ICML&#x2019;17: Proceedings of the 34th International Conference on Machine Learning</conf-name><conf-date>Aug 6-11, 2017</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v70/guo17a/guo17a.pdf">https://proceedings.mlr.press/v70/guo17a/guo17a.pdf</ext-link></comment></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haltaufderheide</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Ranisch</surname><given-names>R</given-names> </name></person-group><article-title>The ethics of ChatGPT in medicine and healthcare: a systematic review on large language models (LLMs)</article-title><source>NPJ Digit Med</source><year>2024</year><month>07</month><day>8</day><volume>7</volume><issue>1</issue><fpage>183</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01157-x</pub-id><pub-id pub-id-type="medline">38977771</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Gal</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ghahramani</surname><given-names>Z</given-names> </name></person-group><article-title>Dropout as a Bayesian approximation: representing model uncertainty in deep learning</article-title><access-date>2026-05-22</access-date><conf-name>ICML&#x2019;16: Proceedings of the 33rd International Conference on Machine Learning</conf-name><conf-date>Jun 19-24, 2016</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v48/gal16.pdf">https://proceedings.mlr.press/v48/gal16.pdf</ext-link></comment></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Begoli</surname><given-names>E</given-names> </name><name name-style="western"><surname>Bhattacharya</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kusnezov</surname><given-names>D</given-names> </name></person-group><article-title>The need for uncertainty quantification in machine-assisted medical decision making</article-title><source>Nat Mach Intell</source><year>2019</year><month>01</month><day>7</day><volume>1</volume><issue>1</issue><fpage>20</fpage><lpage>23</lpage><pub-id 
pub-id-type="doi">10.1038/s42256-018-0004-1</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kompa</surname><given-names>B</given-names> </name><name name-style="western"><surname>Snoek</surname><given-names>J</given-names> </name><name name-style="western"><surname>Beam</surname><given-names>AL</given-names> </name></person-group><article-title>Second opinion needed: communicating uncertainty in medical machine learning</article-title><source>NPJ Digit Med</source><year>2021</year><month>01</month><day>5</day><volume>4</volume><issue>1</issue><fpage>4</fpage><pub-id pub-id-type="doi">10.1038/s41746-020-00367-3</pub-id><pub-id pub-id-type="medline">33402680</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Omar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Agbareia</surname><given-names>R</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>GN</given-names> </name><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name></person-group><article-title>Benchmarking the confidence of large language models in clinical questions</article-title><source>medRxiv</source><comment>Preprint posted online on Sep 10, 2024</comment><pub-id pub-id-type="doi">10.1101/2024.08.11.24311810</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Savage</surname><given-names>T</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Gallo</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Large language model uncertainty measurement and calibration for medical diagnosis and treatment</article-title><source>medRxiv</source><comment>Preprint posted online on Jun 7, 2024</comment><pub-id pub-id-type="doi">10.1101/2024.06.06.24308399</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Tian</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mitchell</surname><given-names>E</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Just ask for calibration: strategies for eliciting calibrated confidence scores from language models fine-tuned with human feedback</article-title><conf-name>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Dec 6-10, 2023</conf-date><pub-id pub-id-type="doi">10.18653/v1/2023.emnlp-main.330</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Tsai</surname><given-names>YHH</given-names> </name><name name-style="western"><surname>Yamada</surname><given-names>M</given-names> </name></person-group><article-title>On verbalized confidence scores for LLMs</article-title><source>arXiv</source><comment>Preprint posted online on Dec 19, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.14737</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Raj</surname><given-names>H</given-names> 
</name><name name-style="western"><surname>Rosati</surname><given-names>D</given-names> </name><name name-style="western"><surname>Majumdar</surname><given-names>S</given-names> </name></person-group><article-title>Measuring reliability of large language models through semantic consistency</article-title><source>arXiv</source><comment>Preprint posted online on Nov 10, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2211.05853</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Fadeeva</surname><given-names>E</given-names> </name><name name-style="western"><surname>Rubashevskii</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shelmanov</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Fact-checking the output of large language models via token-level uncertainty quantification</article-title><conf-name>Findings of the Association for Computational Linguistics: ACL 2024</conf-name><conf-date>Aug 11-16, 2024</conf-date><pub-id pub-id-type="doi">10.18653/v1/2024.findings-acl.558</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>Education</article-title><source>Korean Society of Ultrasound in Medicine (KSUM)</source><access-date>2026-05-23</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.ksum.or.kr/education/index.php">https://www.ksum.or.kr/education/index.php</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Fry</surname><given-names>A</given-names> </name><name name-style="western"><surname>Perelman</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Tart</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ganesh</surname><given-names>A</given-names> </name><name name-style="western"><surname>El-Kishky</surname><given-names>A</given-names> </name></person-group><article-title>OpenAI GPT-5 system card</article-title><source>arXiv</source><comment>Preprint posted online on Dec 19, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2601.03267</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>Introducing Claude Sonnet 4.5</article-title><source>Anthropic</source><year>2025</year><access-date>2026-05-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.anthropic.com/news/claude-sonnet-4-5">https://www.anthropic.com/news/claude-sonnet-4-5</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>Model cards</article-title><source>Google DeepMind</source><year>2025</year><access-date>2026-05-22</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://deepmind.google/models/model-cards/">https://deepmind.google/models/model-cards/</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hurst</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lerer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Goucher</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Perelman</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ramesh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Clark</surname><given-names>A</given-names> </name></person-group><article-title>GPT-4o system 
card</article-title><source>arXiv</source><comment>Preprint posted online on Oct 25, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2410.21276</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Suh</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Shim</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Suh</surname><given-names>CH</given-names> </name><etal/></person-group><article-title>Comparing diagnostic accuracy of radiologists versus GPT-4V and Gemini Pro Vision using image inputs from Diagnosis Please cases</article-title><source>Radiology</source><year>2024</year><month>07</month><volume>312</volume><issue>1</issue><fpage>e240273</fpage><pub-id pub-id-type="doi">10.1148/radiol.240273</pub-id><pub-id pub-id-type="medline">38980179</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cochran</surname><given-names>WG</given-names> </name></person-group><article-title>The comparison of percentages in matched samples</article-title><source>Biometrika</source><year>1950</year><month>12</month><volume>37</volume><issue>3-4</issue><fpage>256</fpage><lpage>266</lpage><pub-id pub-id-type="doi">10.1093/biomet/37.3-4.256</pub-id><pub-id pub-id-type="medline">14801052</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>DeLong</surname><given-names>ER</given-names> </name><name name-style="western"><surname>DeLong</surname><given-names>DM</given-names> </name><name name-style="western"><surname>Clarke-Pearson</surname><given-names>DL</given-names> </name></person-group><article-title>Comparing the areas under two or more correlated 
receiver operating characteristic curves: a nonparametric approach</article-title><source>Biometrics</source><year>1988</year><month>09</month><volume>44</volume><issue>3</issue><fpage>837</fpage><lpage>845</lpage><pub-id pub-id-type="doi">10.2307/2531595</pub-id><pub-id pub-id-type="medline">3203132</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schober</surname><given-names>P</given-names> </name><name name-style="western"><surname>Boer</surname><given-names>C</given-names> </name><name name-style="western"><surname>Schwarte</surname><given-names>LA</given-names> </name></person-group><article-title>Correlation coefficients: appropriate use and interpretation</article-title><source>Anesth Analg</source><year>2018</year><month>05</month><volume>126</volume><issue>5</issue><fpage>1763</fpage><lpage>1768</lpage><pub-id pub-id-type="doi">10.1213/ANE.0000000000002864</pub-id><pub-id pub-id-type="medline">29481436</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Rivera</surname><given-names>M</given-names> </name><name name-style="western"><surname>Godbout</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Rabbany</surname><given-names>R</given-names> </name><name name-style="western"><surname>Pelrine</surname><given-names>K</given-names> </name></person-group><article-title>Combining confidence elicitation and sample-based methods for uncertainty quantification in misinformation mitigation</article-title><conf-name>Proceedings of the 1st Workshop on Uncertainty-Aware NLP (UncertaiNLP 2024)</conf-name><conf-date>Mar 22, 2024</conf-date><pub-id pub-id-type="doi">10.18653/v1/2024.uncertainlp-1.12</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brier</surname><given-names>GW</given-names> </name></person-group><article-title>Verification of forecasts expressed in terms of probability</article-title><source>Mon Wea Rev</source><year>1950</year><month>01</month><volume>78</volume><issue>1</issue><fpage>1</fpage><lpage>3</lpage><pub-id pub-id-type="doi">10.1175/1520-0493(1950)078&#x003C;0001:VOFEIT&#x003E;2.0.CO;2</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Landis</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Koch</surname><given-names>GG</given-names> </name></person-group><article-title>The measurement of observer agreement for categorical data</article-title><source>Biometrics</source><year>1977</year><month>03</month><volume>33</volume><issue>1</issue><fpage>159</fpage><lpage>174</lpage><pub-id pub-id-type="doi">10.2307/2529310</pub-id><pub-id pub-id-type="medline">843571</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>de Oliveira</surname><given-names>R</given-names> </name><name name-style="western"><surname>Garber</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gwinnutt</surname><given-names>JM</given-names> </name><etal/></person-group><article-title>A study of calibration as a measurement of trustworthiness of large language models in biomedical natural language processing</article-title><source>JAMIA Open</source><year>2025</year><volume>8</volume><issue>4</issue><fpage>ooaf058</fpage><pub-id pub-id-type="doi">10.1093/jamiaopen/ooaf058</pub-id><pub-id pub-id-type="medline">40655536</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation 
citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lyu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Shridhar</surname><given-names>K</given-names> </name><name name-style="western"><surname>Malaviya</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Calibrating large language models with sample consistency</article-title><conf-name>Thirty-Ninth AAAI Conference on Artificial Intelligence (AAAI-25)</conf-name><conf-date>Feb 25 to Mar 4, 2025</conf-date><pub-id pub-id-type="doi">10.1609/aaai.v39i18.34120</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huppertz</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Siepmann</surname><given-names>R</given-names> </name><name name-style="western"><surname>Topp</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Revolution or risk? 
Assessing the potential and challenges of GPT-4V in radiologic image interpretation</article-title><source>Eur Radiol</source><year>2025</year><month>03</month><volume>35</volume><issue>3</issue><fpage>1111</fpage><lpage>1121</lpage><pub-id pub-id-type="doi">10.1007/s00330-024-11115-6</pub-id><pub-id pub-id-type="medline">39422726</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Naderi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Safavi-Naini</surname><given-names>SAA</given-names> </name><name name-style="western"><surname>Savage</surname><given-names>T</given-names> </name><name name-style="western"><surname>Atf</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Lewis</surname><given-names>P</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>G</given-names> </name></person-group><article-title>Self-reported confidence of large language models in gastroenterology: analysis of commercial, open-source, and quantized models</article-title><source>arXiv</source><comment>Preprint posted online on May 24, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2503.18562</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vashurin</surname><given-names>R</given-names> </name><name name-style="western"><surname>Fadeeva</surname><given-names>E</given-names> </name><name name-style="western"><surname>Vazhentsev</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Benchmarking uncertainty quantification methods for large language models with LM-Polygraph</article-title><source>Trans Assoc Comput 
Linguist</source><year>2025</year><month>03</month><day>19</day><volume>13</volume><fpage>220</fpage><lpage>248</lpage><pub-id pub-id-type="doi">10.1162/tacl_a_00737</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Omar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Nassar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hijazi</surname><given-names>K</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>GN</given-names> </name><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name></person-group><article-title>Generating credible referenced medical research: a comparative study of OpenAI&#x2019;s GPT-4 and Google&#x2019;s Gemini</article-title><source>Comput Biol Med</source><year>2025</year><month>02</month><volume>185</volume><fpage>109545</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.109545</pub-id><pub-id pub-id-type="medline">39667055</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Azamfirei</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kudchadkar</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Fackler</surname><given-names>J</given-names> </name></person-group><article-title>Large language models and the perils of their hallucinations</article-title><source>Crit Care</source><year>2023</year><month>03</month><day>21</day><volume>27</volume><issue>1</issue><fpage>120</fpage><pub-id pub-id-type="doi">10.1186/s13054-023-04393-x</pub-id><pub-id pub-id-type="medline">36945051</pub-id></nlm-citation></ref><ref 
id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Taubenfeld</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sheffer</surname><given-names>T</given-names> </name><name name-style="western"><surname>Ofek</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Confidence improves self-consistency in LLMs</article-title><conf-name>Findings of the Association for Computational Linguistics: ACL 2025</conf-name><conf-date>Jul 27 to Aug 1, 2025</conf-date><pub-id pub-id-type="doi">10.18653/v1/2025.findings-acl.1030</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Aggarwal</surname><given-names>P</given-names> </name><name name-style="western"><surname>Madaan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name></person-group><article-title>Let&#x2019;s sample step by step: adaptive-consistency for efficient reasoning and coding with LLMs</article-title><conf-name>Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing</conf-name><conf-date>Dec 6-10, 2023</conf-date><pub-id pub-id-type="doi">10.18653/v1/2023.emnlp-main.761</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>He</surname><given-names>C</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Miao</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Xue</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Ma</surname><given-names>L</given-names> </name></person-group><article-title>WaferLLM: large language model inference at Wafer scale</article-title><access-date>2026-05-22</access-date><conf-name>19th USENIX Symposium on Operating Systems Design and Implementation (OSDI &#x2019;25)</conf-name><conf-date>Jul 7-9, 2025</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.usenix.org/system/files/osdi25-he.pdf">https://www.usenix.org/system/files/osdi25-he.pdf</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Han</surname><given-names>T</given-names> </name><name name-style="western"><surname>Jeong</surname><given-names>WK</given-names> </name><name name-style="western"><surname>Shin</surname><given-names>J</given-names> </name></person-group><article-title>Diagnostic performance of multimodal large language models in radiological quiz cases: the effects of prompt engineering and input conditions</article-title><source>Ultrasonography</source><year>2025</year><month>05</month><volume>44</volume><issue>3</issue><fpage>220</fpage><lpage>231</lpage><pub-id pub-id-type="doi">10.14366/usg.25012</pub-id><pub-id pub-id-type="medline">40235070</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Schramm</surname><given-names>S</given-names> </name><name name-style="western"><surname>Preis</surname><given-names>S</given-names> </name><name name-style="western"><surname>Metz</surname><given-names>MC</given-names> </name><etal/></person-group><article-title>Impact of multimodal prompt elements on diagnostic performance of GPT-4V in challenging brain MRI 
cases</article-title><source>Radiology</source><year>2025</year><month>01</month><volume>314</volume><issue>1</issue><fpage>e240689</fpage><pub-id pub-id-type="doi">10.1148/radiol.240689</pub-id><pub-id pub-id-type="medline">39835982</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hong</surname><given-names>EK</given-names> </name><name name-style="western"><surname>Ham</surname><given-names>J</given-names> </name><name name-style="western"><surname>Roh</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Diagnostic accuracy and clinical value of a domain-specific multimodal generative AI model for chest radiograph report generation</article-title><source>Radiology</source><year>2025</year><month>03</month><volume>314</volume><issue>3</issue><fpage>e241476</fpage><pub-id pub-id-type="doi">10.1148/radiol.241476</pub-id><pub-id pub-id-type="medline">40131111</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hong</surname><given-names>EK</given-names> </name><name name-style="western"><surname>Roh</surname><given-names>B</given-names> </name><name name-style="western"><surname>Park</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Value of using a generative AI model in chest radiography reporting: a reader study</article-title><source>Radiology</source><year>2025</year><month>03</month><volume>314</volume><issue>3</issue><fpage>e241646</fpage><pub-id pub-id-type="doi">10.1148/radiol.241646</pub-id><pub-id pub-id-type="medline">40067108</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary table.</p><media xlink:href="jmir_v28i1e86498_app1.docx" xlink:title="DOCX File, 34 
KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>New calibration analysis code.</p><media xlink:href="jmir_v28i1e86498_app2.txt" xlink:title="TXT File, 8 KB"/></supplementary-material></app-group></back></article>