<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e64452</article-id><article-id pub-id-type="doi">10.2196/64452</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Performance of Large Language Models in Numerical Versus Semantic Medical Knowledge: Cross-Sectional Benchmarking Study on Evidence-Based Questions and Answers</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Avnat</surname><given-names>Eden</given-names></name><degrees>MPH, MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Levy</surname><given-names>Michal</given-names></name><degrees>BCS, MD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Herstain</surname><given-names>Daniel</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib 
contrib-type="author"><name name-style="western"><surname>Yanko</surname><given-names>Elia</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ben Joya</surname><given-names>Daniel</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Tzuchman Katz</surname><given-names>Michal</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Eshel</surname><given-names>Dafna</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Laros</surname><given-names>Sahar</given-names></name><degrees>BMedSci</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dagan</surname><given-names>Yael</given-names></name><degrees>BMedSci</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Barami</surname><given-names>Shahar</given-names></name><degrees>BMedSci</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Mermelstein</surname><given-names>Joseph</given-names></name><degrees>BCS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ovadia</surname><given-names>Shahar</given-names></name><degrees>MCS</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Shomron</surname><given-names>Noam</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Shalev</surname><given-names>Varda</given-names></name><degrees>MD, MPH</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Abdulnour</surname><given-names>Raja-Elie E</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib></contrib-group><aff id="aff1"><institution>Faculty of Medicine, Tel Aviv University</institution><addr-line>Chaim Levanon St 55</addr-line><addr-line>Tel Aviv</addr-line><country>Israel</country></aff><aff id="aff2"><institution>Kahun Medical Ltd</institution><addr-line>Givatayim</addr-line><country>Israel</country></aff><aff id="aff3"><institution>Faculty of Medicine, Hebrew University of Jerusalem</institution><addr-line>Jerusalem</addr-line><country>Israel</country></aff><aff id="aff4"><institution>School of Computer Science and Engineering, The Hebrew University of Jerusalem</institution><addr-line>Jerusalem</addr-line><country>Israel</country></aff><aff id="aff5"><institution>The Azrieli Faculty of Medicine, Bar-Ilan University</institution><addr-line>Safed</addr-line><country>Israel</country></aff><aff id="aff6"><institution>Kaplan Medical Center</institution><addr-line>Rehovot</addr-line><country>Israel</country></aff><aff id="aff7"><institution>Division of Pulmonary and Critical Care Medicine, Department of Medicine, Brigham and Women&#x2019;s Hospital, Harvard Medical School</institution><addr-line>Boston</addr-line><addr-line>MA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Thies</surname><given-names>Bill</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Gupta</surname><given-names>Gaurav Kumar</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ling Ong</surname><given-names>Jasmine Chiat</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Eden Avnat, MPH, MD, Faculty of Medicine, Tel Aviv University, Chaim Levanon St 55, Tel Aviv, 6997801, Israel, 972 545299622; <email>edenavnat@mail.tau.ac.il</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>14</day><month>7</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e64452</elocation-id><history><date date-type="received"><day>17</day><month>07</month><year>2024</year></date><date date-type="rev-recd"><day>25</day><month>02</month><year>2025</year></date><date date-type="accepted"><day>06</day><month>05</month><year>2025</year></date></history><copyright-statement>&#x00A9; Eden Avnat, Michal Levy, Daniel Herstain, Elia Yanko, Daniel Ben Joya, Michal Tzuchman Katz, Dafna Eshel, Sahar Laros, Yael Dagan, Shahar Barami, Joseph Mermelstein, Shahar Ovadia, Noam Shomron, Varda Shalev, Raja-Elie E Abdulnour. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 14.7.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e64452"/><abstract><sec><title>Background</title><p>Clinical problem-solving requires processing of semantic medical knowledge, such as illness scripts, and numerical medical knowledge of diagnostic tests for evidence-based decision-making. 
As large language models (LLMs) show promising results in many aspects of language-based clinical practice, their ability to generate nonlanguage evidence-based answers to clinical questions is inherently limited by tokenization.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate LLMs&#x2019; performance on two question types: numeric (correlating findings) and semantic (differentiating entities), while examining differences within and between LLMs in medical aspects and comparing their performance to humans.</p></sec><sec sec-type="methods"><title>Methods</title><p>To generate straightforward multichoice questions and answers (Q and As) based on evidence-based medicine (EBM), we used a comprehensive medical knowledge graph (containing data from more than 50,000 peer-reviewed studies) and created the EBM questions and answers (EBMQAs). EBMQA comprises 105,222 Q and As, categorized by medical topics (eg, medical disciplines) and nonmedical topics (eg, question length), and classified into numerical or semantic types. We benchmarked a dataset of 24,000 Q and As on two state-of-the-art LLMs, GPT-4 (OpenAI) and Claude 3 Opus (Anthropic). We evaluated the LLM&#x2019;s accuracy on semantic and numerical question types and according to sublabeled topics. In addition, we examined the question-answering rate of LLMs by enabling them to choose to abstain from responding to questions. For validation, we compared the results for 100 unrelated numerical EBMQA questions between six human medical experts and the two language models.</p></sec><sec sec-type="results"><title>Results</title><p>In an analysis of 24,542 Q and As, Claude 3 and GPT-4 performed better on semantic Q and As (68.7%, n=1593 and 68.4%, n=1709), respectively, than on numerical Q and As (61.3%, n=8583 and 56.7%, n=12,038), respectively, with Claude 3 outperforming GPT-4 in numeric accuracy (<italic>P</italic>&#x003C;.001). 
A median accuracy gap of 7% (IQR 5%&#x2010;10%) was observed between the best and worst sublabels per topic, with different LLMs excelling in different sublabels. Focusing on Medical Discipline sublabels, Claude 3 performed well in neoplastic disorders but struggled with genitourinary disorders (69%, n=676 vs 58%, n=464; <italic>P</italic>&#x003C;.0001), while GPT-4 excelled in cardiovascular disorders but struggled with neoplastic disorders (60%, n=1076 vs 53%, n=704; <italic>P</italic>=.0002). Furthermore, humans (82.3%, n=82.3) surpassed both Claude 3 (64.3%, n=64.3; <italic>P</italic>&#x003C;.001) and GPT-4 (55.8%, n=55.8; <italic>P</italic>&#x003C;.001) in the validation test. Spearman correlation between question-answering and accuracy rate in both Claude 3 and GPT-4 was insignificant (&#x03C1;=0.12, <italic>P</italic>=.69; &#x03C1;=0.43, <italic>P</italic>=.13).</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Both LLMs excelled more in semantic than numerical Q and As, with Claude 3 surpassing GPT-4 in numerical Q and As. However, both LLMs showed inter- and intramodel gaps in different medical aspects and remained inferior to humans. In addition, their ability to respond or abstain from answering a question does not reliably predict how accurately they perform when they do attempt to answer questions. Thus, their medical advice should be addressed carefully.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>questions and answers</kwd><kwd>dataset</kwd><kwd>evidence-based medicine</kwd><kwd>benchmark</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Clinical problem-solving requires the processing of data using the clinician&#x2019;s fund of knowledge in the form of illness scripts [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>], most of which is semantic (differentiating or opting entities). 
The statistical weight of relationships between data that define an illness is the numerical equivalent of medical knowledge that is essential for prioritizing diagnostic hypotheses and decision-making [<xref ref-type="bibr" rid="ref3">3</xref>].</p><p>Clinicians develop and use numerical knowledge through original research and leverage diagnostic support tools for more complex decision-making [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. However, the explosive amount of medical knowledge and complex health care systems is a tremendous challenge to high-quality, evidence-based medicine (EBM) [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>The breakthrough of large language models (LLMs), which process extensive data and encode knowledge from numerous online studies, shows great promise as tools for medical decision support [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. LLMs provide users with a sense of reliability and accuracy, but evidence shows that they occasionally generate responses that are not based on actual knowledge or give incorrect explanations [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. 
In addition, their performance on nontextual knowledge, such as medical codes, is limited [<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>Thus, physicians continue to express skepticism regarding LLMs and their capacity to outperform humans [<xref ref-type="bibr" rid="ref13">13</xref>].</p><p>Several benchmark studies have addressed this subject by focusing on lengthy questions from licensing examinations [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref14">14</xref>] or on datasets derived from medical abstracts that could only be answered with &#x201C;yes,&#x201D; &#x201C;no,&#x201D; or &#x201C;maybe&#x201D; [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>To create a dataset that consists solely of EBM knowledge and is flexible enough to generate both semantic and numeric questions and answers (Q and As), we used the Kahun knowledge graph&#x2014;a clinically validated artificial intelligence tool that uses a medical, evidence-based knowledge graph. We have developed a methodology to generate Q and As from this knowledge graph and created the EBM question and answer (EBMQA) dataset. The dataset comprises 105,000 short multiple-choice questions based on insights extracted from full-length studies and is aimed to test LLM&#x2019;s ability to assist physicians.</p><p>Finally, we benchmarked two state-of-the-art LLMs: OpenAI&#x2019;s GPT-4 [<xref ref-type="bibr" rid="ref16">16</xref>], and Anthropic&#x2019;s Claude 3 Opus (Claude 3) [<xref ref-type="bibr" rid="ref17">17</xref>], using part of EBMQA. In addition, we compared their results to medical experts. Thus, we evaluated the performance of LLMs in both numerical and semantic Q and A, identified differences within and between LLMs across diverse medical and nonmedical domains, and compared their results to humans. 
These analyses allowed us to assess whether physicians can trust LLMs.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>EBMQA</title><sec id="s2-1-1"><title>Kahun</title><p>Kahun (developed by Kahun Medical Ltd) is a diagnostic tool based on artificial intelligence and structured knowledge graph technologies. The knowledge graph encompasses more than 50,000 peer-reviewed publications and more than 20,000,000 medical relations that were mapped by medical experts [<xref ref-type="bibr" rid="ref18">18</xref>]. Kahun&#x2019;s unique structure and its EBM content serve as a reasonable platform to generate the EBMQA. Since the data in EBMQA is based on Kahun&#x2019;s knowledge graph, which embraces the EBM approach, the gold standard for the answers in the EBMQA is based on published, peer-reviewed medical literature.</p></sec><sec id="s2-1-2"><title>Questions Structure</title><p>All Q and As were derived from Kahun&#x2019;s knowledge graph. Each question was generated based on data from nodes and edges in the graph and consisted of three main entities: source (usually a disorder related to the target), target (usually a symptom or sign related to the source), and background (usually a relevant population related to the source). In this study, we refer to source, target, or background as entities.</p><p>In addition, the relation between entities (derived from data on the edges) determines the question type and the specific template used to generate the questions and the answers. Further explanation regarding template creation is provided in the <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>EBMQA is comprised of two types of questions: (1) numeric Q and As&#x2014;derived from connections between a source and a single target. 
These questions deal with choosing the range in which the correct answer resides and are based solely on this statistical correlation (<xref ref-type="fig" rid="figure1">Figure 1</xref>) and (2) semantic Q and As&#x2014;derived from connections between a source and up to six targets (possible answers). These questions deal with choosing the most common targets related to a source, given a specific relation (eg, subtype, location, and duration), and therefore integrating statistical knowledge across multiple entities and distinguishing between those entities (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><p>Further examples of both numeric and semantic Q and As are provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of the study: from Kahun's knowledge graph, which references source, target, and background as edges of the graph (1-2), to the evidence-based medicine question and answer dataset and the large language model benchmarking (3-4), which includes both numeric and semantic questions and answers.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e64452_fig01.png"/></fig></sec><sec id="s2-1-3"><title>Multiple-Choice Question Structure</title><p>The questions in the EBMQA are multiple-choice. Numeric Q and As have one correct answer, while semantic Q and As have up to five correct answers. However, for questions in which one does not know the answer, an &#x201C;I do not know&#x201D; (IDK) option was added to all questions as a possible answer.</p></sec><sec id="s2-1-4"><title>Numerical Data and Possible Answers</title><p>Each Q and A is based on numerical data derived from Kahun&#x2019;s knowledge graph, including minimum, maximum, and midvalues, estimating the connections between medical entities. 
We used statistical methods, including median and median absolute deviation (MAD), to categorize answers into meaningful ranges based on their calculated midvalues. Specific methodologies for categorizing these ranges and detailed statistical information for each Q and A type are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-1-5"><title>Q and A Exclusion</title><p>EBMQA aims to provide concise medical Q and As. Therefore, as explained in the &#x201C;Questions Structure&#x201D; and &#x201C;Multiple-Choice Question Structure&#x201D; sections, questions involving multiple sources, backgrounds, or targets (except semantic questions) were excluded from the EBMQA. This exclusion ensures Q and As with one main, clearly defined subject and a defined target population. In addition, Q and As that are not related to medical knowledge (such as the average length of a season) were removed to maintain focus on medical information. Duplicate questions were excluded, and in such cases, the remaining question retained the average of all duplicated mean values. Therefore, each Q and A in the EBMQA is unique, ensuring no contradictions exist and eliminating any impact duplicates might have on further analysis. To prevent confusion, Q and As with answers indicating &#x201C;all answers are correct&#x201D; or &#x201C;none of the answers are correct&#x201D; were deleted from the dataset.</p></sec><sec id="s2-1-6"><title>Labeling</title><p>Each Q and A in the study was categorized using multiple medical data labels derived from standardized medical classifications such as those provided by Snomed CT [<xref ref-type="bibr" rid="ref19">19</xref>] and Kahun&#x2019;s medical expertise. These classifications include, but are not limited to, medical type, medical subject type, medical discipline, and prevalence. Each Q and A was also analyzed for its question length and distribution of answers. 
Details on the specific labeling criteria and categories are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec></sec><sec id="s2-2"><title>Benchmark Analysis: Q and A Selection and Subanalysis</title><p>Due to the relatively different structure of semantic Q and A and the limited number of Q and A, we analyzed all of them separately.</p><p>Regarding numeric Q and A types and in the search for meaningful parameters that might influence LLM&#x2019;s performance, the benchmark included Q and As based on three medical labels (medical subject type, medical discipline, and prevalence) and three nonmedical sublabels (Q and A types, question length, and answers distribution) as further detailed in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. All Q and As were randomly selected, and although the total number of Q and As per label varied, each label contains an identical number of selected Q and As per sublabeled entity, with no repetition across selections.</p></sec><sec id="s2-3"><title>LLMs Prompting</title><p>In this study, we used two state-of-the-art LLMs: GPT-4 (gpt-4&#x2010;0125-preview) and Claude 3 Opus (claude-3-opus-20240229). Both models&#x2019; parameters included temperature=0 and maximum tokens=300. All queries were sent to each LLM using its respective application programming interface. The application programming interface calls were made using R (version 4.2.2; Posit Software, PBC) via R Studio. 
Further descriptions of the prompts and suitable examples are presented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref></p></sec><sec id="s2-4"><title>Evaluating LLM&#x2019;s Performance</title><p>We evaluated LLM&#x2019;s performance using the following metrics: (1) accuracy&#x2014;for both semantic and numeric Q and As, the total number of correct answers suggested by the LLM divided by the total answers suggested by the LLM (excluding IDK answers); (2) answer rate (AR)&#x2014;for both semantic and numeric Q and As, the total number of both correct and wrong answers suggested by the LLM (excluding IDK answers) divided by the total answers suggested by the LLM (including IDK answers); and (3) majority&#x2014;for numeric Q and As only, the option that is selected as the correct answer most frequently among all given options in a questionnaire.</p></sec><sec id="s2-5"><title>Prompt Sensitivity Analysis</title><p>To test both the effects of adding IDK as a possible answer and changing the order of answers (including IDK), 8 different prompts were tested on 100 randomly selected numerical questions. Four prompts included IDK with a different order of possible answers, while four excluded IDK. To prevent bias in the selection process across specific question types, difficulty levels, medical disciplines, and question lengths, and to accurately represent the proportion of each question type in the EBMQA dataset, we randomly selected the questions included in the questionnaire.</p></sec><sec id="s2-6"><title>Human Validation</title><p>To validate the Q and As in the EBMQA, 2 physicians and 4 clinical-year medical students (3 females and 3 males; aged between 28 and 35 years; all educated and licensed in Israel) answered the questionnaire. Each completed it first with the IDK option and then with mandatory guessing on their previous IDK responses. 
Their accuracies, with and without guessing, were compared to LLM&#x2019;s performance.</p></sec><sec id="s2-7"><title>Analysis and Variables</title><p>All statistical analyses were performed using R Studio (R version 4.2.2). Categorical variables were represented as percentages, while continuous variables were represented as means and SDs for normally distributed data, or medians and IQRs otherwise. The cutoff for statistically significant results was set at <italic>&#x03B1;</italic>=.05, and 95% CIs were calculated. Proportions comparison was conducted using the &#x201C;Proportion test.&#x201D; Spearman correlation was used to analyze correlations between 2 quantitative variables.</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>This study was approved by the Tel Aviv University Ethics Committee (institutional review board protocol number 0008527&#x2010;2). All questionnaire data were anonymized and deidentified in accordance with HIPAA (Health Insurance Portability and Accountability Act) Safe Harbor privacy rules. Informed consent was received by all the participants. Before answering the questionnaire, participants were informed that the study was being conducted for research purposes only, no personal or sensitive data would be collected, answers would not be identified in the results, participation would be voluntary, and informed consent would be provided by answering the questionnaire. Appropriate measures were taken to ensure compliance with relevant privacy guidelines. No compensation was provided to participants.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>EBMQA</title><p>The EBMQA contains 105,222 Q and As. 
In addition, each Q and A pair was labeled according to metadata labels and medical labels.</p><sec id="s3-1-1"><title>Medical Labels</title><p>The EBMQA encompasses diverse medical data types, including a unique count of 7746 &#x201C;Disorders,&#x201D; 2547 &#x201C;Signs or Symptoms,&#x201D; 1243 &#x201C;Lab tests,&#x201D; 885 &#x201C;Imaging or procedures,&#x201D; 474 &#x201C;Background&#x201D; data (demographics, habits, family history, etc), and more (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p><p>Among the medical subject types, &#x201C;Disorders&#x201D; was the most abundant with 45,964 Q and As, followed by &#x201C;Symptoms and Signs&#x201D; with 30,152 Q and As, &#x201C;Lab test&#x201D; with 5966 Q and As, and &#x201C;Imaging or Procedures&#x201D; with 4374 Q and As. All the other subjects encompass 640 Q and As (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p><p>Focusing on medical discipline, the EBMQA contains 64,846 relevant Q and As: the leading medical discipline was the digestive system with 9879 Q and As, followed by the cardiovascular system with 7847 Q and As, and infectious diseases with 7798 Q and As. The musculoskeletal system had the least number of Q and As, that is, 2832 (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p><p>Regarding the &#x201C;Prevalence&#x201D; label, the median prevalence was 1e-4 (IQR 2e-6 to 1.98e-4) and the MAD was 9.810102e-05 (IQR 1.9e-6 to 1.98e-4). Of these, 36,653 Q and As focused on high-prevalence disorders, 22,139 Q and As focused on moderate-prevalence disorders, and 2531 Q and As focused on low-prevalence disorders (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p></sec><sec id="s3-1-2"><title>Metadata Labels</title><p>EBMQA includes 13 distinct Q and A types (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). 
The most frequent Q and A type was &#x201C;Sensitivity&#x201D; with 70% (74,140/105,222) of the total Q and As. Eight Q and A types had less than 900 Q and As: specificity, positive likelihood ratio, negative likelihood ratio, relative risk, prevalence, positive predictive value, negative predictive value, and associated risk.</p><p>In total, the median number of words per question (including the question, instructions, and possible answers) in the EBMQA was 57 (IQR 53&#x2010;66), with a MAD of 5. The medium-length question group had the majority of Q and As (ie, 59,998), whereas the short-length question group had the fewest Q and As (ie, 9968) (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>). Focusing on each Q and A type, &#x201C;Risk Factor&#x201D; Q and As had the longest median of question length with 81 (IQR 80&#x2010;84) words, while &#x201C;Sensitivity&#x201D; Q and As had the shortest with 54 (IQR 52&#x2010;58) words (<xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>).</p><p>Regarding numeric questions with 3 range values, the most frequently distributed answer was the midrange values (46,431 Q and As), followed by the low-range values (28,598 Q and As), and the high-range values (14,292 Q and As; <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>).</p></sec></sec><sec id="s3-2"><title>Benchmark Analysis</title><p>Of the 105,222 Q and As, a set of 24,542 questions was presented to each LLM. 
&#x201C;Numeric&#x201D; Q and As comprised 90% (22,000/24,542) of the questions, whereas &#x201C;semantic&#x201D; Q and As accounted for the remaining 10% (2542/24,542).</p><p>Both LLMs demonstrated better performances in the semantic Q and As than in the numeric Q and As in terms of accuracy (Claude 3: 68.65%, 1592.78/2320, vs 61.29%, 8583/14,005, <italic>P</italic>&#x003C;.001; GPT-4: 68.38%, 1708.85/2499, vs 56.74%, 12,038/21,215, <italic>P</italic>&#x003C;.001) and AR (Claude 3: 94.62%, 2320/2542, vs 63.66%, 14,005/22,000, <italic>P</italic>&#x003C;.001; GPT-4: 98.31%, 2499/2542, vs 96.4%, 21,215/22,000, <italic>P</italic>&#x003C;.001).</p><p>From an intermodel perspective, Claude 3 outperformed GPT-4 in numeric accuracy, though no significant difference was found in semantic accuracy. However, in comparison to Claude 3, GPT-4 had a higher AR in both semantic and numeric questions (<xref ref-type="table" rid="table1">Table 1</xref>). Focusing on numeric accuracy and excluding any questions that one or both LLMs responded to with IDK results in the exclusion of 8133 questions and a total of 13,867 answered questions. 
Keeping the same trend, Claude 3 outperformed GPT-4 in numeric accuracy (8491/13,867 vs 8255/13,867; <italic>P</italic>=.004).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Claude 3 versus GPT-4: overall accuracy and answer rate for semantic and numeric questions.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Question type and model</td><td align="left" valign="bottom">Accuracy, % (n/N)</td><td align="left" valign="bottom">Answer rate, % (n/N)</td><td align="left" valign="bottom">Proportion test, <italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="3">Semantic</td><td align="left" valign="top">.86</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Claude 3 (Anthropic)</td><td align="char" char="." valign="top">68.65% (1592.78/2320)</td><td align="char" char="." valign="top">94.62% (2320/2542)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4 (OpenAI)</td><td align="char" char="." valign="top">68.38% (1708.85/2499)</td><td align="char" char="." 
valign="top">98.31% (2499/2542)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="3"><named-content content-type="indent">Numeric</named-content></td><td align="left" valign="top">&#x003C;.00001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Claude 3</td><td align="left" valign="top">61.29% (8583/14,005)</td><td align="left" valign="top">63.66% (14,005/22,000)</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>GPT-4</td><td align="left" valign="top">56.74% (12,038/21,215)</td><td align="left" valign="top">96.4% (21,215/22,000)</td><td align="left" valign="top"/></tr></tbody></table></table-wrap></sec><sec id="s3-3"><title>Prompt Sensitivity Analysis</title><p>Based on the results of the questionnaire, the average accuracy of Claude 3 with the IDK option versus without it was not significantly different (64.25%; mean 64.25, SD 3.95, vs 59.25%; mean 59.25, SD 5; <italic>P=.</italic>17). A similar trend was noted for GPT-4 (55.75%; mean 55.75, SD 1.71, vs 53.25%; mean 53.25, SD 2.89; <italic>P=</italic>.24). In addition, within each subgroup&#x2014;Claude 3 with and without the IDK option, and GPT-4 with and without the IDK option&#x2014;no single answer-option-order prompt was significantly superior to the others (<xref ref-type="supplementary-material" rid="app7">Multimedia Appendices 7</xref> and <xref ref-type="supplementary-material" rid="app8">8</xref>).</p></sec><sec id="s3-4"><title>Human Validation</title><p>Claude 3 and GPT-4 achieved higher average accuracy rates, with or without the IDK option, than random guessing (33%, n=33) or majority guessing (47%, n=47). 
However, both models had lower average accuracy rates compared to humans with the IDK option (82.3%; mean 82.3, SD 2.82) or without it (78.2%; mean 78.2, SD 3.6; <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>).</p></sec><sec id="s3-5"><title>Numeric Q and A Subanalysis</title><p>The accuracy gap between the highest and lowest accuracy rates in each LLM was calculated, revealing a median difference of 7% (IQR 5%&#x2010;10%; <xref ref-type="supplementary-material" rid="app10">Multimedia Appendix 10</xref>). Focusing on disorders selected sublabels, Claude 3 performed well in neoplastic disorders but struggled with genitourinary disorders (69%, 676/984 vs 58%, 464/803; <italic>P&#x003C;</italic>.0001), while GPT-4 excelled in cardiovascular disorders but struggled with neoplastic disorders (60%, 1076/1783 vs 53%, 704/1316; <italic>P=</italic>.0002; <xref ref-type="supplementary-material" rid="app11">Multimedia Appendix 11</xref>). Furthermore, among sublabel disorders queried over 200 times, Spearman correlations between Q and A and accuracy rate in both Claude 3 and GPT-4 were insignificant (<italic>&#x03C1;=0.12, P=</italic>.69<italic>; &#x03C1;=0.43, P=</italic>.13).</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study aimed to highlight the current gaps in the medical knowledge of LLMs and their current ability to surpass humans. We presented a method to create an EBMQA from a structured knowledge graph and benchmarked two state-of-the-art LLMs (GPT-4 and Claude 3) [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. We demonstrated that both LLMs performed better in semantic Q and As than in numerical Q and As by asking more than 24,000 Q and As (<xref ref-type="table" rid="table1">Table 1</xref>). 
Claude 3 outperformed GPT-4 in numerical Q and As and showed similar results in semantic Q and As, although it exhibited significantly lower ARs (<xref ref-type="table" rid="table1">Table 1</xref>). A validation test indicated that the numerical accuracy rates of Claude 3 and GPT-4 were higher than majority guessing but remained lower than those of medical experts (<xref ref-type="supplementary-material" rid="app12">Multimedia Appendix 12</xref>).</p></sec><sec id="s4-2"><title>Prior Work and Novel Contribution</title><p>The use of knowledge graphs for evaluating LLMs is gaining popularity [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref22">22</xref>]. Kahun&#x2019;s structured knowledge graph enabled us to generate both semantic and numeric labeled Q and A pairs, without using advanced models [<xref ref-type="bibr" rid="ref22">22</xref>]. Our Q and A generation process, which relies on templates designed to fit a source-target-background graph structure, can apply to other graphs with a similar structure. In addition, this relatively large knowledge graph allowed us to create a massive EBM dataset. Moreover, we embraced a data-driven approach in which distractors were based on subanalysis distribution rather than specific or random values.</p><p>The EBMQA, which consists of 105,222 straightforward single-line Q and As, was designed to mimic physicians&#x2019; strategy of breaking complex medical scenarios into less complicated problems, unlike medical licensing examination datasets, which are typically complex-case oriented [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. 
In addition, the EBMQA addresses numeric and semantic data, which is considered fundamental for physicians [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>], while dealing with data from studies and embracing the EBM approach [<xref ref-type="bibr" rid="ref7">7</xref>], as opposed to the abstracted-based yes or no or maybe Q and As in PubMedQA [<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>A major concern regarding applying LLMs in health care is the uncertainty of providing solid evidence that supports their answers [<xref ref-type="bibr" rid="ref8">8</xref>]. Clinical evidence predominantly relies on statistical and numerical data. Thus, it is imperative to examine whether LLMs can deliver this type of reasoning. It has been shown that LLMs are more capable when given semantic questions rather than numerical questions, though in a relatively small sample size (smaller than 200 Q and As) [<xref ref-type="bibr" rid="ref26">26</xref>]. As far as we know, we were the first to show this trend in the medical field while using a much larger scale (<xref ref-type="table" rid="table1">Table 1</xref>). Furthermore, since both semantic and numeric questions in the EBMQA may address the same entities but from different perspectives, our study questioned whether LLMs can support their semantic answers with statistical data.</p><p>In addition, as LLMs are gaining more popularity as decision-support tools [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], understanding which types of questions will yield more precise answers, as demonstrated in our semantic and numeric analysis, could benefit not only the medical community but also the general use of LLMs.</p><p>A recent benchmark analysis, focused on nephrology Q and As, only found that GPT-4 outperformed Claude 2 [<xref ref-type="bibr" rid="ref27">27</xref>]. 
Although our intermodel examination did not include a direct nephrology comparison due to a different classification method, it reveals that generally Claude 3 outperformed GPT-4, and specifically in a variety of medical disciplines such as neoplastic disorders, nervous system, and more. Our results raise the need to constantly benchmark new LLMs as they continuously improve.</p><p>Regarding internal model variations, the differences in accuracy between the highest and lowest performing medical disciplines, 8% for Claude 3 and 6% for GPT-4, support previous benchmarks that found LLM performance can vary across different medical disciplines [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>].</p><p>Moreover, this comprehensive benchmark widens the medical scope and further supports both intra- and inter-model differences by exploring medical subjects: Claude 3 favors &#x201C;Imaging and Procedures&#x201D; and struggles with &#x201C;Disorders&#x201D; (64%, 463/719 vs 60%, 3181/5296; <italic>P</italic>=.03, respectively), while GPT-4 excels in &#x201C;Imaging and Procedures&#x201D; but struggles with &#x201C;Lab tests&#x201D; (60%, 1017/1683 vs 53%, 1008/1886; <italic>P</italic>&#x003C;.0001, respectively). Thus, our study underscores two vital, yet distinct, aspects in the integration of LLMs into daily medical practice: first, the specific areas of medical expertise where each LLM excels, and second, comparing which LLM is superior within each area of medical expertise.</p><p>As suggested by previous studies, LLMs are highly sensitive to question wording, structure, and subject matter. 
Consequently, direct comparisons across different benchmarks, which rely on distinct datasets, may yield varying scores that do not necessarily reflect a genuine knowledge gap but rather other confounding factors such as those mentioned at the start of this section [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. For example, Katz et al [<xref ref-type="bibr" rid="ref28">28</xref>] reported that GPT-4&#x2019;s accuracy rates ranged from 17.42% (n=21) to 74.7% (n=90) across various medical disciplines. In contrast, our study found GPT-4&#x2019;s accuracy rates ranged more narrowly, from 53.5% (n=704) to 60.35% (n=1076). This discrepancy could be partially attributed to the differing medical disciplines emphasized in each study, as well as variations in question structure. While Katz et al [<xref ref-type="bibr" rid="ref28">28</xref>] used five different exams with potentially diverse question formats, all questions in our study were generated using the same templates, resulting in a relatively narrower accuracy range. Furthermore, Liu et al [<xref ref-type="bibr" rid="ref31">31</xref>] benchmarked GPT-4 and Claude 3 using the Japanese National Medical Examination and reported accuracy rates of 80.0% (n=720) and 83.6% (n=752), respectively. Although our study observed a similar trend, the accuracy rates differed, with 56.74% (n=12,038) for GPT-4 and 61.29% (n=8583) for Claude 3. A key distinction between the 2 benchmark studies lies in the question structure: Liu et al&#x2019;s dataset included questions with multiple correct answers, whereas our numerical questions had only a single correct answer. 
These examples highlight both the importance of treating direct comparisons among LLM benchmark studies with caution and the value of developing multiple high-quality benchmarks on unique, well-designed datasets.</p></sec><sec id="s4-3"><title>Clinical Impact and Further Needed Research</title><p>As the debate over whether models surpass humans persists [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>], the outcomes of our validation tests suggest that humans still excel in certain medical tasks. Therefore, we support further evaluations of LLMs before using them in medical settings.</p><p>Furthermore, the insignificant correlation between accuracy and AR contradicts the theory that a model&#x2019;s confidence in its response reflects its subject expertise [<xref ref-type="bibr" rid="ref32">32</xref>]. Thus, abstaining from providing an answer failed to explain the intramodel variance results, specifically across medical disciplines. Notably, recent research has shown that LLMs exhibit varied abstention abilities, which is consistent with our finding and may be influenced by model-specific characteristics, context nature, and question type [<xref ref-type="bibr" rid="ref33">33</xref>]. For instance, some LLMs find it challenging to abstain from Boolean questions with standard prompts. Intriguingly, modifying the context by introducing irrelevant information can occasionally enhance abstention performance and, thereby, improve overall task accuracy [<xref ref-type="bibr" rid="ref33">33</xref>].</p><p>This evidence, along with our findings, raises concerns that without prior knowledge of both the medical field and the model, the trustworthiness of LLMs is questionable.</p><p>In terms of prompt engineering, our sensitivity analysis showed relatively small SDs in prompt accuracy, which supported our prompt stability. In addition, although insignificant, the IDK prompt yielded higher accuracy and was therefore used. 
Moreover, changing the order of the distractors did not significantly affect the LLM&#x2019;s performance.</p></sec><sec id="s4-4"><title>Limitations</title><p>Our benchmark has several limitations. First, although medically tuned LLMs have shown promising results [<xref ref-type="bibr" rid="ref34">34</xref>], they are not publicly available and hence were not included in this study. We highly recommend conducting a similar benchmark that includes these LLMs. Second, we did not use additional context for the prompt or use external methods such as retrieval-augmented generation, which could potentially improve the results. We chose not to use these methods because we believe that, currently, physicians are asking LLMs straightforward questions. In addition, some of these external methods are not widely accessible to end users and are far more complex than the typical daily use of LLMs that we aimed to replicate. Given that these methods might influence the results, we strongly recommend conducting research that focuses on retrieval-augmented generation or providing extra context in the prompt. Third, the study was designed so that the models and human participants would only choose one suggested answer, without providing additional information or feedback. Therefore, we support further studies to examine these responses by the models, while considering feedback from human physicians. Fourth, this study did not include a subanalysis regarding progressive patterns such as the abstention behavior. Therefore, we recommend further research on the subject. 
Another limitation is the known potential biases when using LLMs, such as the training on an enormous amount of data, which may include bias and inaccuracies itself, or may also harm the contextual understanding of medical cases and result in poorer answers due to undertraining on less common medical disciplines [<xref ref-type="bibr" rid="ref35">35</xref>].</p></sec><sec id="s4-5"><title>Conclusions</title><p>On the EBMQA dataset, which resembles physicians&#x2019; problem-solving approach, LLMs were better at solving semantic than numeric questions. Despite Claude 3 surpassing GPT-4, both LLMs exhibited inter- and intramodel gaps in medical knowledge. In addition, human participants outperformed both LLMs on numeric questions. These results suggest that LLMs&#x2019; responses, especially numeric ones, should be considered cautiously in clinical settings.</p></sec></sec></body><back><ack><p>No AI tool was used to write this manuscript. This study was funded by Kahun Medical Ltd.</p></ack><notes><sec><title>Data Availability</title><p>The datasets generated or analyzed during this study are available from the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>EA, ML, JM, and MTZ contributed to conceptualization. EA, SO, ML, and JM contributed to data curation and software development. EA, ML, DBJ, and EY contributed to methodology and project administration. DBJ, SL, DH, DE, YD, SB, ES, EY, and DE contributed to validation. EA contributed to writing the original draft. EA and DBJ contributed to reviewing and editing the paper. REA, NS, and VS provided supervision and guidance.</p></fn><fn fn-type="conflict"><p>The authors EA, ML, DH, DBJ, MTK, DE, SL, YD, SB, JM, and SO are paid employees by Kahun Ltd. 
All other authors declare no financial or non-financial competing interests.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AR</term><def><p>answer rate</p></def></def-item><def-item><term id="abb2">EBM</term><def><p>evidence-based medicine</p></def></def-item><def-item><term id="abb3">EBMQA</term><def><p>evidence-based medicine question and answer</p></def></def-item><def-item><term id="abb4">HIPAA</term><def><p>Health Insurance Portability and Accountability Act</p></def></def-item><def-item><term id="abb5">IDK</term><def><p>I do not know</p></def></def-item><def-item><term id="abb6">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb7">MAD</term><def><p>median absolute deviation</p></def></def-item><def-item><term id="abb8">Q and A</term><def><p>question and answer</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Custers</surname><given-names>E</given-names> </name></person-group><article-title>Thirty years of illness scripts: theoretical origins and practical applications</article-title><source>Med Teach</source><year>2015</year><month>05</month><volume>37</volume><issue>5</issue><fpage>457</fpage><lpage>462</lpage><pub-id pub-id-type="doi">10.3109/0142159X.2014.956052</pub-id><pub-id pub-id-type="medline">25180878</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bowen</surname><given-names>JL</given-names> </name></person-group><article-title>Educational strategies to promote clinical diagnostic reasoning</article-title><source>N Engl J Med</source><year>2006</year><month>11</month><day>23</day><volume>355</volume><issue>21</issue><fpage>2217</fpage><lpage>2225</lpage><pub-id 
pub-id-type="doi">10.1056/NEJMra054782</pub-id><pub-id pub-id-type="medline">17124019</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McGee</surname><given-names>S</given-names> </name></person-group><article-title>Simplifying likelihood ratios</article-title><source>J Gen Intern Med</source><year>2002</year><month>08</month><volume>17</volume><issue>8</issue><fpage>646</fpage><lpage>649</lpage><pub-id pub-id-type="doi">10.1046/j.1525-1497.2002.10750.x</pub-id><pub-id pub-id-type="medline">12213147</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cullen</surname><given-names>RJ</given-names> </name></person-group><article-title>In search of evidence: family practitioners&#x2019; use of the Internet for clinical information</article-title><source>J Med Libr Assoc</source><year>2002</year><month>10</month><volume>90</volume><issue>4</issue><fpage>370</fpage><lpage>379</lpage><pub-id pub-id-type="medline">12398243</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fourcade</surname><given-names>A</given-names> </name><name name-style="western"><surname>Khonsari</surname><given-names>RH</given-names> </name></person-group><article-title>Deep learning in medical image analysis: a third eye for doctors</article-title><source>J Stomatol Oral Maxillofac Surg</source><year>2019</year><month>09</month><volume>120</volume><issue>4</issue><fpage>279</fpage><lpage>288</lpage><pub-id pub-id-type="doi">10.1016/j.jormas.2019.06.002</pub-id><pub-id pub-id-type="medline">31254638</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Sackett</surname><given-names>DL</given-names> </name><name name-style="western"><surname>Rosenberg</surname><given-names>WM</given-names> </name><name name-style="western"><surname>Gray</surname><given-names>JA</given-names> </name><etal/></person-group><article-title>Evidence based medicine: what it is and what it isn&#x2019;t</article-title><source>BMJ</source><year>1996</year><month>01</month><day>13</day><volume>312</volume><issue>7023</issue><fpage>71</fpage><lpage>72</lpage><pub-id pub-id-type="doi">10.1136/bmj.312.7023.71</pub-id><pub-id pub-id-type="medline">8555924</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bastian</surname><given-names>H</given-names> </name><name name-style="western"><surname>Glasziou</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chalmers</surname><given-names>I</given-names> </name></person-group><article-title>Seventy-five trials and eleven systematic reviews a day: how will we ever keep up?</article-title><source>PLOS Med</source><year>2010</year><month>09</month><day>21</day><volume>7</volume><issue>9</issue><fpage>e1000326</fpage><pub-id pub-id-type="doi">10.1371/journal.pmed.1000326</pub-id><pub-id pub-id-type="medline">20877712</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature New 
Biol</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mehandru</surname><given-names>N</given-names> </name><name name-style="western"><surname>Miao</surname><given-names>BY</given-names> </name><name name-style="western"><surname>Almaraz</surname><given-names>ER</given-names> </name><etal/></person-group><article-title>Evaluating large language models as agents in the clinic</article-title><source>NPJ Digit Med</source><year>2024</year><month>04</month><day>3</day><volume>7</volume><issue>1</issue><fpage>84</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01083-y</pub-id><pub-id pub-id-type="medline">38570554</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Eggmann</surname><given-names>F</given-names> </name><name name-style="western"><surname>Weiger</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zitzmann</surname><given-names>NU</given-names> </name><name name-style="western"><surname>Blatz</surname><given-names>MB</given-names> </name></person-group><article-title>Implications of large language models such as ChatGPT for dental medicine</article-title><source>J Esthet Restor Dent</source><year>2023</year><month>10</month><volume>35</volume><issue>7</issue><fpage>1098</fpage><lpage>1102</lpage><pub-id pub-id-type="doi">10.1111/jerd.13046</pub-id><pub-id pub-id-type="medline">37017291</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>P</given-names> </name><name name-style="western"><surname>Hua</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Benchmarking large language models on cmexam -- a comprehensive chinese medical exam dataset</article-title><source>Arxiv</source><access-date>2025-07-10</access-date><comment>Preprint posted online on  Jun 5, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2306.03030">http://arxiv.org/abs/2306.03030</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2306.03030</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Soroush</surname><given-names>A</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Zimlichman</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Large language models are poor medical coders &#x2014; benchmarking of medical code querying</article-title><source>NEJM AI</source><year>2024</year><month>04</month><day>25</day><volume>1</volume><issue>5</issue><pub-id pub-id-type="doi">10.1056/AIdbp2300040</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sezgin</surname><given-names>E</given-names> </name></person-group><article-title>Artificial intelligence in healthcare: complementing, not replacing, doctors and healthcare providers</article-title><source>Digit Health</source><year>2023</year><volume>9</volume><fpage>20552076231186520</fpage><pub-id pub-id-type="doi">10.1177/20552076231186520</pub-id><pub-id 
pub-id-type="medline">37426593</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Pal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Umapathi</surname><given-names>LK</given-names> </name><name name-style="western"><surname>Sankarasubbu</surname><given-names>M</given-names> </name></person-group><article-title>MedMCQA: a large-scale multi-subject multi-choice dataset for medical domain question answering</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 22, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2203.14371</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Dhingra</surname><given-names>B</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>PubMedQA: a dataset for biomedical research question answering</article-title><source>arXiv</source><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/1909.06146">http://arxiv.org/abs/1909.06146</ext-link></comment></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>GPT-4</article-title><source>OpenAI</source><access-date>2025-06-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/research/gpt-4">https://openai.com/research/gpt-4</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><article-title>Introducing the next generation of claude</article-title><source>Anthropic</source><year>2024</year><access-date>2025-06-13</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.anthropic.com/news/claude-3-family">https://www.anthropic.com/news/claude-3-family</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Avnat</surname><given-names>E</given-names> </name><name name-style="western"><surname>Samin</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ben Joya</surname><given-names>D</given-names> </name><etal/></person-group><article-title>The potential of evidence-based clinical intake tools to discover or ground prevalence of symptoms using real-life digital health encounters: retrospective cohort study</article-title><source>J Med Internet Res</source><year>2024</year><month>07</month><day>16</day><volume>26</volume><fpage>e49570</fpage><pub-id pub-id-type="doi">10.2196/49570</pub-id><pub-id pub-id-type="medline">39012659</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>SNOMED CT</article-title><source>National Library of Medicine</source><year>2024</year><access-date>2025-06-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nlm.nih.gov/healthit/snomedct/index.html">https://www.nlm.nih.gov/healthit/snomedct/index.html</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>K</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>YE</given-names> </name><name name-style="western"><surname>Zha</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Head-to-tail: how knowledgeable are large language models (llms)? a.k.a. 
will llms replace knowledge graphs?</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 20, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2308.10168">http://arxiv.org/abs/2308.10168</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Abu-Rasheed</surname><given-names>H</given-names> </name><name name-style="western"><surname>Weber</surname><given-names>C</given-names> </name><name name-style="western"><surname>Fathi</surname><given-names>M</given-names> </name></person-group><article-title>Knowledge graphs as context sources for LLM-based explanations of learning recommendations</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 5, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2403.03008">http://arxiv.org/abs/2403.03008</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mulla</surname><given-names>N</given-names> </name><name name-style="western"><surname>Gharpure</surname><given-names>P</given-names> </name></person-group><article-title>Automatic question generation: a review of methodologies, datasets, evaluation metrics, and applications</article-title><source>Prog Artif Intell</source><year>2023</year><month>03</month><volume>12</volume><issue>1</issue><fpage>1</fpage><lpage>32</lpage><pub-id pub-id-type="doi">10.1007/s13748-023-00295-9</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>E</given-names> </name><name 
name-style="western"><surname>Oufattole</surname><given-names>N</given-names> </name><etal/></person-group><article-title>What disease does this patient have? a large-scale open domain question answering dataset from medical exams</article-title><source>arXiv</source><access-date>2025-07-10</access-date><comment>Preprint posted online on  Sep 28, 2020</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2009.13081">http://arxiv.org/abs/2009.13081</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2009.13081</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bierer</surname><given-names>BE</given-names> </name><name name-style="western"><surname>Baedorf Kassis</surname><given-names>S</given-names> </name></person-group><article-title>Communicating complex numeric information in clinical research</article-title><source>Front Commun</source><year>2023</year><volume>8</volume><pub-id pub-id-type="doi">10.3389/fcomm.2023.1096271</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gansel</surname><given-names>X</given-names> </name><name name-style="western"><surname>Mary</surname><given-names>M</given-names> </name><name name-style="western"><surname>van Belkum</surname><given-names>A</given-names> </name></person-group><article-title>Semantic data interoperability, digital medicine, and e-health in infectious disease management: a review</article-title><source>Eur J Clin Microbiol Infect Dis</source><year>2019</year><month>06</month><volume>38</volume><issue>6</issue><fpage>1023</fpage><lpage>1034</lpage><pub-id pub-id-type="doi">10.1007/s10096-019-03501-6</pub-id><pub-id pub-id-type="medline">30771124</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation 
citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Rasool</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Kurniawan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Balugo</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Evaluating LLMs on document-based QA: exact answer selection and numerical extraction using CogTale dataset</article-title><source>arXiv</source><access-date>2025-07-10</access-date><comment>Preprint posted online on  Nov 11, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2311.07878">http://arxiv.org/abs/2311.07878</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2311.07878</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Koo</surname><given-names>M</given-names> </name><name name-style="western"><surname>Blum</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Benchmarking open-source large language models, GPT-4 and Claude 2 on multiple-choice questions in Nephrology</article-title><source>NEJM AI</source><year>2024</year><month>01</month><day>25</day><volume>1</volume><issue>2</issue><fpage>AIdbp2300092</fpage><pub-id pub-id-type="doi">10.1056/AIdbp2300092</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Katz</surname><given-names>U</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>E</given-names> </name><name name-style="western"><surname>Shachar</surname><given-names>E</given-names> </name><etal/></person-group><article-title>GPT versus resident 
physicians &#x2014; a benchmark based on official board scores</article-title><source>NEJM AI</source><year>2024</year><month>04</month><day>25</day><volume>1</volume><issue>5</issue><fpage>AIdbp2300192</fpage><pub-id pub-id-type="doi">10.1056/AIdbp2300192</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Pezeshkpour</surname><given-names>P</given-names> </name><name name-style="western"><surname>Hruschka</surname><given-names>E</given-names> </name></person-group><article-title>Large language models sensitivity to the order of options in multiple-choice questions</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 22, 2023</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2308.11483">https://arxiv.org/abs/2308.11483</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>W</given-names> </name><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><name name-style="western"><surname>Xiang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Can multiple-choice questions really be useful in detecting the abilities of LLMs?</article-title><source>arXiv</source><access-date>2025-07-10</access-date><comment>Preprint posted online on  Mar 26, 2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2403.17752">https://arxiv.org/abs/2403.17752</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2403.17752</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Okuhara</surname><given-names>T</given-names> </name><name name-style="western"><surname>Dai</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Evaluating the effectiveness of advanced large language models in medical knowledge: a comparative study using Japanese national medical examination</article-title><source>Int J Med Inform</source><year>2025</year><month>01</month><volume>193</volume><fpage>105673</fpage><pub-id pub-id-type="doi">10.1016/j.ijmedinf.2024.105673</pub-id><pub-id pub-id-type="medline">39471700</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Kadavath</surname><given-names>S</given-names> </name><name name-style="western"><surname>Conerly</surname><given-names>T</given-names> </name><name name-style="western"><surname>Askell</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Language models (mostly) know what they know</article-title><source>arXiv</source><access-date>2025-07-10</access-date><comment>Preprint posted online on  Jul 11, 2022</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2207.05221">https://arxiv.org/abs/2207.05221</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2207.05221</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wen</surname><given-names>B</given-names> </name><name name-style="western"><surname>Howe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>LL</given-names> </name></person-group><article-title>Characterizing LLM abstention behavior in science QA with context perturbations</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 18, 
2024</comment><comment><ext-link ext-link-type="uri" xlink:href="https://arxiv.org/abs/2404.12452">https://arxiv.org/abs/2404.12452</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gottweis</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Toward expert-level medical question answering with large language models</article-title><source>Nat Med</source><year>2025</year><month>03</month><volume>31</volume><issue>3</issue><fpage>943</fpage><lpage>950</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03423-7</pub-id><pub-id pub-id-type="medline">39779926</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ullah</surname><given-names>E</given-names> </name><name name-style="western"><surname>Parwani</surname><given-names>A</given-names> </name><name name-style="western"><surname>Baig</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>R</given-names> </name></person-group><article-title>Challenges and barriers of using large language models (LLM) such as ChatGPT for diagnostic medicine with a focus on digital pathology - a recent scoping review</article-title><source>Diagn Pathol</source><year>2024</year><month>02</month><day>27</day><volume>19</volume><issue>1</issue><fpage>43</fpage><pub-id pub-id-type="doi">10.1186/s13000-024-01464-7</pub-id><pub-id pub-id-type="medline">38414074</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional data and explanations.</p><media 
xlink:href="jmir_v27i1e64452_app1.docx" xlink:title="DOCX File, 4314 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Examples for semantic and numeric questions and answers.</p><media xlink:href="jmir_v27i1e64452_app2.docx" xlink:title="DOCX File, 2486 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Numeric question and answers benchmark subanalysis according to medical and nonmedical labels and sublabels.</p><media xlink:href="jmir_v27i1e64452_app3.png" xlink:title="PNG File, 574 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Distribution of the data and labels in the evidence-based medicine questions and answers: (A) unique medical data type, (B) question subject, (C) medical discipline, (D) disorders prevalence, (E) question type, and (F) question length.</p><media xlink:href="jmir_v27i1e64452_app4.png" xlink:title="PNG File, 888 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Distributions of answers according to question type.</p><media xlink:href="jmir_v27i1e64452_app5.docx" xlink:title="DOCX File, 2487 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Distribution of the correct answer with midvalues ranging from 0 to 1 of the questions and answers in the evidence-based medicine question and answer dataset, categorized by the overall median value of each question and answer type and the corresponding median absolute deviation: 0&#x2264; midvalue &#x003C; overall median - median absolute deviation (short), overall median - median absolute deviation &#x2264; midvalue &#x2264; overall median + median absolute deviation (medium), and overall median + median absolute deviation &#x2264; midvalue &#x003C;1 (long).</p><media xlink:href="jmir_v27i1e64452_app6.png" xlink:title="PNG File, 57 
KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Sensitivity analysis of four prompts with the &#x201C;I do not know&#x201D; option was assessed according to their accuracy. Each row represents a different order of the possible answers. The order of the possible answers in the prompt is based on the sequence of letters or symbols, separated by hyphens, from left to right. Each letter or symbol represents a frequency range determined by the relevant overall median and the median absolute deviation: frequency range &#x2265; overall median + median absolute deviation (frequent [F]), overall median - median absolute deviation &#x2264; frequency range &#x2264; overall median + median absolute deviation (medium [M]), frequency range &#x2264; overall median - median absolute deviation (rare [R]), and I do not know.</p><media xlink:href="jmir_v27i1e64452_app7.png" xlink:title="PNG File, 15 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Sensitivity analysis of four prompts without the &#x201C;I do not know&#x201D; option was assessed according to their accuracy. Each row represents a different order of the possible answers. The order of the possible answers in the prompt is based on the sequence of letters or symbols, separated by hyphens, from left to right. 
Each letter or symbol represents a frequency range determined by the relevant overall median and the median absolute deviation: frequency range &#x2265; overall median + median absolute deviation (frequent [F]), overall median - median absolute deviation &#x2264; frequency range &#x2264; overall median + median absolute deviation (medium [M]), frequency range &#x2264; overall median - median absolute deviation (rare [R]).</p><media xlink:href="jmir_v27i1e64452_app8.png" xlink:title="PNG File, 14 KB"/></supplementary-material><supplementary-material id="app9"><label>Multimedia Appendix 9</label><p>Human and prompt validation.</p><media xlink:href="jmir_v27i1e64452_app9.docx" xlink:title="DOCX File, 2833 KB"/></supplementary-material><supplementary-material id="app10"><label>Multimedia Appendix 10</label><p>Numeric question and answer accuracy rate sublabel analysis: (A) answer distribution, (B) medical discipline, (C) medical subject type, (D) question and answer type, (E) disorder prevalence, and (F) question length. Red asterisks represent proportion <italic>P</italic> values: .05&#x003C; *&#x003C;.01, ***&#x003C;.0001.</p><media xlink:href="jmir_v27i1e64452_app10.png" xlink:title="PNG File, 284 KB"/></supplementary-material><supplementary-material id="app11"><label>Multimedia Appendix 11</label><p>Proportion comparison according to sublabels.</p><media xlink:href="jmir_v27i1e64452_app11.docx" xlink:title="DOCX File, 2491 KB"/></supplementary-material><supplementary-material id="app12"><label>Multimedia Appendix 12</label><p>Validation test: each large language model was tested 8 times&#x2014;4 times with the &#x201C;I do not know&#x201D; (abstain) option, using the same prompt but in different order of possible answers, and 4 times without the abstain option. In addition, 6 medical experts were tested: first with the abstain option, and then without. 
Error bars indicating 1 SD and answer rate bars were added only for trials with the abstain option.</p><media xlink:href="jmir_v27i1e64452_app12.png" xlink:title="PNG File, 61 KB"/></supplementary-material></app-group></back></article>