<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e70901</article-id><article-id pub-id-type="doi">10.2196/70901</article-id><article-categories><subj-group subj-group-type="heading"><subject>Viewpoint</subject></subj-group></article-categories><title-group><article-title>Beyond Benchmarks: Evaluating Generalist Medical Artificial Intelligence With Psychometrics</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Sun</surname><given-names>Luning</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gibbons</surname><given-names>Christopher</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hern&#x00E1;ndez-Orallo</surname><given-names>Jos&#x00E9;</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Wang</surname><given-names>Xiting</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Jiang</surname><given-names>Liming</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Stillwell</surname><given-names>David</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Luo</surname><given-names>Fang</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff6">6</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Xie</surname><given-names>Xing</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>The Psychometrics Centre, Cambridge Judge Business School, University of Cambridge</institution><addr-line>Cambridge</addr-line><country>United Kingdom</country></aff><aff id="aff2"><institution>Oracle Health</institution><addr-line>Austin</addr-line><addr-line>TX</addr-line><country>United States</country></aff><aff id="aff3"><institution>Valencian Research Institute for Artificial Intelligence (VRAIN), Universitat Polit&#x00E8;cnica de Val&#x00E8;ncia</institution><addr-line>Val&#x00E8;ncia</addr-line><country>Spain</country></aff><aff id="aff4"><institution>Valencian Graduate School and Research Network of AI</institution><addr-line>Val&#x00E8;ncia</addr-line><country>Spain</country></aff><aff id="aff5"><institution>Gaoling School of Artificial Intelligence, Renmin University of China</institution><addr-line>Beijing</addr-line><country>China</country></aff><aff 
id="aff6"><institution>Faculty of Psychology, Beijing Normal University</institution><addr-line>19 Xinwai Ave</addr-line><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff7"><institution>Microsoft Research Asia (China)</institution><addr-line>Beijing</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Chrimes</surname><given-names>Dillon</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Gupta</surname><given-names>Gaurav Kumar</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Fang Luo, PhD, Faculty of Psychology, Beijing Normal University, 19 Xinwai Ave, Beijing, 100875, China, 86 15120098365; <email>luof@bnu.edu.cn</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>26</day><month>5</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e70901</elocation-id><history><date date-type="received"><day>05</day><month>01</month><year>2025</year></date><date date-type="rev-recd"><day>16</day><month>04</month><year>2025</year></date><date date-type="accepted"><day>17</day><month>04</month><year>2025</year></date></history><copyright-statement>&#x00A9; Luning Sun, Christopher Gibbons, Jos&#x00E9; Hern&#x00E1;ndez-Orallo, Xiting Wang, Liming Jiang, David Stillwell, Fang Luo, Xing Xie. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 26.5.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e70901"/><abstract><p>Rigorous evaluation of generalist medical artificial intelligence (GMAI) is imperative to ensure their utility and safety before implementation in health care. Current evaluation strategies rely heavily on benchmarks, which can suffer from issues with data contamination and cannot explain how GMAI might fail (lacking explanatory power) or in what circumstances (lacking predictive power). To address these limitations, we propose a new methodology to improve the quality of GMAI evaluation using construct-oriented processes. Drawing on modern psychometric techniques, we introduce approaches to construct identification and present alternative assessment formats for different domains of professional skills, knowledge, and behaviors that are essential for safe practice. 
We also discuss the need for human oversight in future GMAI adoption.</p></abstract><kwd-group><kwd>generalist medical artificial intelligence</kwd><kwd>psychometrics</kwd><kwd>construct-oriented evaluation</kwd><kwd>benchmark</kwd><kwd>health care</kwd><kwd>explanatory power</kwd><kwd>predictive power</kwd><kwd>data contamination</kwd><kwd>human oversight</kwd></kwd-group></article-meta></front><body><sec id="s1"><title>Generalist Medical Artificial Intelligence</title><p>Imagine that you are running a medical practice, which is recruiting a junior doctor. One candidate, named Dr. Alex Ivy (Dr. A.I.), is shortlisted, as they present excellent results in the United States Medical Licensing Examination. To determine if Dr. A.I. is ready to join the practice, how would you evaluate their competency?</p><p>It may not be long before an actual Dr. A.I., that is, an artificial intelligence (AI) system specifically designed for medicine, becomes part of our medical practice. Recent advancement in AI technology, particularly the development of foundation models, including large language models (LLMs), is enabling the application of general-purpose AI systems in health care. Termed generalist medical artificial intelligence (GMAI) [<xref ref-type="bibr" rid="ref1">1</xref>], these systems show promising performance in a wide range of health care&#x2013;related tasks. For instance, ChatGPT was able to generate clinical letters that were indistinguishable from those written by human doctors [<xref ref-type="bibr" rid="ref2">2</xref>]. Based on PaLM-2, Google developed an AI agent called articulate medical intelligence explorer (AMIE), which appeared capable of clinical history-taking and diagnostic reasoning [<xref ref-type="bibr" rid="ref3">3</xref>]. 
According to a recent review [<xref ref-type="bibr" rid="ref4">4</xref>], the most prevalent health care applications of LLMs include clinical decision support, medical education and examination, patient education, medical question answering, administrative tasks, and mental health support. While GMAI demonstrates versatile task capacity, rigorous evaluation is required to fully understand their capabilities and limitations and ascertain they are safe and secure before being adopted in medical practice.</p></sec><sec id="s2"><title>Benchmark-Based Evaluation and Its Limitations</title><p>Current GMAI evaluation strategies rely heavily on benchmarks, typically consisting of questions from established medical licensing examinations, such as MultiMedQA [<xref ref-type="bibr" rid="ref5">5</xref>]. The performance is usually indicated by an aggregate accuracy score, which is compared against human respondents, domain experts, or a certain passing score set for humans. This strategy lacks explanatory power, as it is unable to inform the types of errors GMAI makes, identify their weaknesses, or provide insight into GMAI&#x2019;s performance on tasks not within the benchmark assessment. For example, GPT-4 was able to achieve a passing score on the Japanese national medical licensing examinations [<xref ref-type="bibr" rid="ref6">6</xref>]. However, this seemingly promising result was coupled with the finding that LLMs sometimes endorsed prohibited choices that should be strictly avoided in clinical practice. If one overlooks the types of errors in this case, the implementation of LLMs could lead to serious medical malpractice.</p><p>In addition to the lack of explanatory power, benchmarks are also short of predictive power. An aggregate accuracy score derived from a benchmark is not useful to determine how GMAI will behave for a single case, especially in tasks that are not assessed by the benchmark or even not predefined. 
Given the unprecedented versatility of GMAI, they could be applied to a wide range of tasks, including those newly defined by the user, which present a challenge for evaluation. Despite the outstanding performance on existing benchmarks, it is hard to tell if GMAI will perform well for a new task, as the assumption that GMAI&#x2019;s performance on a limited number of tasks used in a benchmark directly reflects their performance in a practically infinite range of applicable tasks is unsubstantiated [<xref ref-type="bibr" rid="ref7">7</xref>]. This is particularly relevant in the fast-paced field of medicine, where patterns of disease and treatment trends will change over time, potentially leading to data drift and bias in the model output [<xref ref-type="bibr" rid="ref8">8</xref>]. Furthermore, LLMs may exhibit inconsistent performance [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>] when there is a distribution shift in the domain or style [<xref ref-type="bibr" rid="ref11">11</xref>], even subtle changes in the way in which they are prompted [<xref ref-type="bibr" rid="ref12">12</xref>]. This questions the generalizability and robustness of benchmark-based evaluation.</p><p>A related issue with benchmarks is data contamination, which suggests that benchmarks used for evaluation may have been included in the training data of foundation models [<xref ref-type="bibr" rid="ref13">13</xref>] or leaked for the model fine-tuning [<xref ref-type="bibr" rid="ref14">14</xref>]. This could result in overfitting, where the model performs well on the benchmarks but does not extrapolate to new tasks. The overestimation of the performance of a contaminated model causes misleading evaluation as well as unfair comparison with others. 
Considering the lack of transparency in the field [<xref ref-type="bibr" rid="ref15">15</xref>] and fierce competition for commercial success, data contamination has become a critical issue for benchmarks, undermining their reliability and validity.</p></sec><sec id="s3"><title>Beyond Benchmarks: Construct-Oriented Evaluation</title><p>The limitations of benchmark-based evaluation of GMAI highlight the need for a more comprehensive and robust evaluation method. Now let&#x2019;s revisit the scenario described at the beginning. In order to evaluate Dr. A.I., can we learn from the assessment procedure designed for human doctors? Take the United Kingdom as an example. To join the medical register, medical students need to take the Medical Licensing Assessment, which has two components: the applied knowledge test, consisting of multiple-choice questions that test the ability to apply medical knowledge to different scenarios; and the clinical and professional skills assessment, which involves responding to scenarios that might occur in medical practice. Through this carefully designed procedure, potential doctors are assessed on different domains of professional skills, knowledge, and behaviors that are essential for safe practice.</p><p>How can we evaluate GMAI on their &#x201C;professional skills, knowledge, and behaviors that are essential for safe practice&#x201D;? We propose construct-oriented evaluation, which focuses on the assessment of constructs in GMAI. Constructs, such as cognitive abilities and personality traits, are concepts that underlie clusters of related behaviors [<xref ref-type="bibr" rid="ref16">16</xref>]. These concepts facilitate the understanding of the relationship among behaviors and are also predictive of future outcomes. 
A well-known example is the Big Five personality model, which delineates personality into five distinct constructs that account for a large proportion of individual differences in human personality [<xref ref-type="bibr" rid="ref17">17</xref>]. Following a similar approach to the development of the Big Five personality model, Burnell et al [<xref ref-type="bibr" rid="ref18">18</xref>] extracted three factors that accounted for 82% of the variance in LLMs&#x2019; performance on 27 cognitive tasks in the HELM (Holistic Evaluation of Language Models) benchmark [<xref ref-type="bibr" rid="ref19">19</xref>]; the three factors represented the capabilities of reasoning, comprehension, and core language modelling. Conceptually grouping the 27 cognitive tasks in this way more clearly articulates the specific strengths and weaknesses of each LLM, in comparison to analyzing the aggregate accuracy score across the benchmark. It also allows for the prediction of the performance on any task that requires the same set or a subset of the constructs, even unseen ones, effectively addressing the real-world challenges such as data drift and distribution shifts. Expanding this practice to GMAI contributes to a deepened understanding of their performance and limitations as well as identifying domains that might be included in future evaluation.</p><p>As shown in the example above by Burnell et al [<xref ref-type="bibr" rid="ref18">18</xref>], constructs can be identified via a bottom-up approach that uncovers meaningful constructs in empirical data using psychometric techniques such as factor analysis. Constructs can also be determined by domain experts or based on best practices, following a top-down approach. 
For instance, to evaluate the conversation quality of the AI agent AMIE, evaluation rubrics were developed based on best practices for patient-centered communication in medical interviews and various criteria for the clinical and professional skills assessment in the United Kingdom [<xref ref-type="bibr" rid="ref3">3</xref>]. A broad range of communication skills, such as fostering the relationship and responding to emotions, were included in the rubrics. Subsequently, a practical assessment was carried out, where patient actors and specialists were employed to rate the performance of AMIE, according to the rubrics. By integrating top-down and bottom-up approaches, important constructs, which cover domains both within and outside current benchmarks, can be clearly defined and guide the evaluation of GMAI (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Illustration of construct-oriented evaluation with example constructs (taken from [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]) and various assessment formats.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e70901_fig01.png"/></fig><p>To measure a certain construct in GMAI, psychometrics specifies a variety of assessment formats that are not limited to test-based assessment such as benchmarks. Other formats include practical assessment, as shown in the example with AMIE [<xref ref-type="bibr" rid="ref3">3</xref>], observational assessment, situational assessment, interactive assessment, among others, all of which are commonly used to evaluate skills and behaviors in psychometrics [<xref ref-type="bibr" rid="ref16">16</xref>]. 
Unlike benchmarks that tend to use a fixed set of static tasks, these alternative formats are more flexible in terms of what tasks are presented and how they are presented; hence, they are more appropriate for constructs that are not covered by current benchmarks.</p><p>For example, empathy is an important competency that improves clinical outcomes and patient care experiences [<xref ref-type="bibr" rid="ref20">20</xref>]. Health care professionals, including GMAI, are expected to demonstrate empathy in their interaction with patients [<xref ref-type="bibr" rid="ref21">21</xref>]. While it might not be possible to measure empathy in a medical knowledge examination, we could simulate a conversation with GMAI to gauge their empathy. For instance, we could ask an LLM-based chatbot to carry out a conversation with a human actor who has just received some tragic news. Under such circumstances, it is not appropriate to tell a joke, which would have been acceptable as an empathetic response to someone showing negative emotions in a nonclinical scenario. By simulating the clinical settings where GMAI may be deployed, we are able to achieve robust, real-world evaluations that are not possible with traditional, narrow-scoped benchmarks [<xref ref-type="bibr" rid="ref22">22</xref>].</p><p>Notably, since no standard answers are provided, these alternative assessment formats are less susceptible to data contamination. No matter what format of assessment is adopted, psychometrics provides a scientific framework to examine its reliability and validity. For instance, in cases where multiple raters are involved, interrater reliability should be reported so that a certain level of confidence could be put into the assessment results. 
Such measures of quality assurance would ensure consistent and reliable assessments of subjective constructs.</p><p>When identifying and measuring constructs in GMAI, it is important that we do not assume that psychometric constructs that are traditionally developed for human traits and behaviors may fully map onto AI capabilities [<xref ref-type="bibr" rid="ref23">23</xref>]. There is also a risk of anthropomorphizing AI systems by directly applying tools made for human assessment. Necessary adjustments in the construct conceptualization and development of measurement tools are needed, considering the fundamentally different nature of AI cognition and architecture.</p><p>It is worth noting that we do not suggest that benchmarks should not be employed. Instead, we aim to provide a methodology, based on which benchmarks could be better interpreted and reliable and valid assessment instruments be developed to assess a wider range of domains of professional skills, knowledge, and behaviors that are essential for safe practice. As a matter of fact, the development of benchmarks can greatly benefit from modern psychometric techniques. For example, item response theory [<xref ref-type="bibr" rid="ref24">24</xref>], which models the probability of a correct response as a function of item parameters and the test-taker&#x2019;s level of the target construct, allows scale linking, computerized adaptive testing, and differential item functioning analysis [<xref ref-type="bibr" rid="ref25">25</xref>], improving the precision and validity of benchmarks. Mart&#x00ED;nez-Plumed et al [<xref ref-type="bibr" rid="ref26">26</xref>] have already shown that item response theory can be adapted to the analysis of AI experiments, offering insights at the instance level. To mitigate the issue of data contamination, new benchmark items with predictable item parameters could easily be developed based on automatic item generation [<xref ref-type="bibr" rid="ref27">27</xref>]. 
In short, we expect more instrumental roles to be played by psychometric techniques in the evaluation of GMAI.</p></sec><sec id="s4"><title>Challenge: Need for Human Oversight in Health Care</title><p>Recent regulations on the use of AI have consistently emphasized the importance of rigorous evaluation to ensure AI systems are safe and secure (eg, The EU Artificial Intelligence Act and The Executive Order on the Safe, Secure, and Trustworthy Development and Use of Artificial Intelligence). This is especially necessary for the application of GMAI, which will be integrated into routine health care services [<xref ref-type="bibr" rid="ref28">28</xref>]. In the early stages of GMAI adoption, human-in-the-loop is suggested for medical decision-making so that all AI outputs are verified by health care professionals. As AI technologies rapidly progress, we are expected to move into more selective and high-level human oversight. Based on construct-oriented evaluation, which is predictive at a granular level, we can anticipate the cases where human oversight should be selectively invested. Specifically, when the AI systems are predicted to probably fail, their output should be rejected. When a clear success is predicted, their output should be accepted. Only in borderline cases is human oversight necessary. Construct-oriented evaluation also provides explanatory information about intervention. For instance, if an AI system demonstrates low empathy, we could be informed of situations where more supervision is required and the system should be improved in subsequent development. 
With rigorous and robust evaluation, which necessitates joint efforts of researchers and practitioners from computer science, medicine, as well as psychometrics and collaborations with health care institutions, we will be able to determine where the AI systems are reliable and where they may need more assistance, preferably at a case-by-case level that takes into account the stakes at risk, &#x201C;to ensure that AI technologies are developed and deployed responsibly, striking a balance between innovation and the safeguarding of patient well-being.&#x201D; [<xref ref-type="bibr" rid="ref29">29</xref>]</p></sec></body><back><ack><p>LS gratefully acknowledges financial support from Invesco through their philanthropic donation to Cambridge Judge Business School. XW is funded by the National Natural Science Foundation of China (Grant 62476279). FL is funded by the National Natural Science Foundation of China (Grant 62377003).</p></ack><fn-group><fn fn-type="con"><p>Conceptualization: LS, CG, JHO, XW, LJ, DS, FL, XX</p><p>Funding acquisition: DS, FL, XX</p><p>Investigation: LS</p><p>Methodology: LS, CG, JHO, XW, LJ, DS, FL, XX</p><p>Visualization: LS</p><p>Writing &#x2013; original draft: LS</p><p>Writing &#x2013; review &#x0026; editing: LS, CG, JHO, XW, LJ, DS, FL, XX</p></fn><fn fn-type="conflict"><p>CG is an employee of Oracle Health Inc., serves on the Board of Directors at the International Society for Quality of Life Research, and holds stock in Oracle Corporation. XW has previously been employed at Microsoft Research and holds stock in Microsoft. LJ has previously served as an intern at Microsoft Research. XX is an employee of Microsoft Research and holds stock in Microsoft. 
All other authors declare no conflicts of interest.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AMIE</term><def><p>articulate medical intelligence explorer</p></def></def-item><def-item><term id="abb2">GMAI</term><def><p>generalist medical artificial intelligence</p></def></def-item><def-item><term id="abb3">HELM</term><def><p>holistic evaluation of language models</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moor</surname><given-names>M</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>O</given-names> </name><name name-style="western"><surname>Abad</surname><given-names>ZSH</given-names> </name><etal/></person-group><article-title>Foundation models for generalist medical artificial intelligence</article-title><source>Nature</source><year>2023</year><month>04</month><volume>616</volume><issue>7956</issue><fpage>259</fpage><lpage>265</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-05881-4</pub-id><pub-id pub-id-type="medline">37045921</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ali</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Dobbs</surname><given-names>TD</given-names> </name><name name-style="western"><surname>Hutchings</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Whitaker</surname><given-names>IS</given-names> </name></person-group><article-title>Using ChatGPT to write patient clinic letters</article-title><source>Lancet Digit 
Health</source><year>2023</year><month>04</month><volume>5</volume><issue>4</issue><fpage>e179</fpage><lpage>e181</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(23)00048-1</pub-id><pub-id pub-id-type="medline">36894409</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Palepu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Schaekermann</surname><given-names>M</given-names> </name><name name-style="western"><surname>Saab</surname><given-names>K</given-names> </name><name name-style="western"><surname>Freyberg</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tanno</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Towards conversational diagnostic AI</article-title><year>2024</year><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2401.05654">http://arxiv.org/abs/2401.05654</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2401.05654</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tam</surname><given-names>TYC</given-names> </name><name name-style="western"><surname>Sivarajkumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kapoor</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A framework for human evaluation of large language models in healthcare derived from literature review</article-title><source>NPJ Digit Med</source><year>2024</year><month>09</month><day>28</day><volume>7</volume><issue>1</issue><fpage>258</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01258-7</pub-id><pub-id 
pub-id-type="medline">39333376</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Kasai</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kasai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sakaguchi</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yamada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Radev</surname><given-names>D</given-names> </name></person-group><article-title>Evaluating GPT-4 and ChatGPT on Japanese medical licensing examinations</article-title><year>2023</year><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2303.18027">http://arxiv.org/abs/2303.18027</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.18027</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Hern&#x00E1;ndez-Orallo</surname><given-names>J</given-names> </name></person-group><source>The Measure of All Minds: Evaluating Natural and Artificial 
Intelligence</source><year>2017</year><publisher-name>Cambridge University Press</publisher-name><pub-id pub-id-type="doi">10.1017/9781316594179</pub-id><pub-id pub-id-type="other">9781316594179</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Duckworth</surname><given-names>C</given-names> </name><name name-style="western"><surname>Chmiel</surname><given-names>FP</given-names> </name><name name-style="western"><surname>Burns</surname><given-names>DK</given-names> </name><etal/></person-group><article-title>Using explainable machine learning to characterise data drift and detect emergent health risks for emergency department admissions during COVID-19</article-title><source>Sci Rep</source><year>2021</year><month>11</month><day>26</day><volume>11</volume><issue>1</issue><fpage>23017</fpage><pub-id pub-id-type="doi">10.1038/s41598-021-02481-y</pub-id><pub-id pub-id-type="medline">34837021</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Yuan</surname><given-names>L</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Cui</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>F</given-names> </name><name name-style="western"><surname>Cheng</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Revisiting out-of-distribution robustness in NLP: benchmark, analysis, and llms evaluations</article-title><source>NeurIPS</source><comment>Preprint posted online on 2023</comment><comment><ext-link ext-link-type="uri" 
xlink:href="http://arxiv.org/abs/2306.04618">http://arxiv.org/abs/2306.04618</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2306.04618</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Hai</surname><given-names>J</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>On the out-of-distribution generalization of multimodal large language models</article-title><year>2024</year><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2402.06599">http://arxiv.org/abs/2402.06599</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.06599</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kaczmarczyk</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wilhelm</surname><given-names>TI</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>R</given-names> </name></person-group><article-title>Evaluating multimodal AI in medical diagnostics</article-title><source>NPJ Digit Med</source><year>2024</year><fpage>1</fpage><lpage>5</lpage><pub-id pub-id-type="doi">10.1038/s41746-024-01208-3</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Sclar</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Choi</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tsvetkov</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Suhr</surname><given-names>A</given-names> </name></person-group><article-title>Quantifying language models&#x2019; sensitivity to spurious features in prompt design or: how I learned to start worrying about prompt formatting</article-title><year>2023</year><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2310.11324">http://arxiv.org/abs/2310.11324</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.11324</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Deng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Gerstein</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cohan</surname><given-names>A</given-names> </name></person-group><article-title>Investigating data contamination in modern benchmarks for large language models</article-title><year>2023</year><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2311.09783">http://arxiv.org/abs/2311.09783</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2311.09783</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Balloccu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Schmidtov&#x00E1;</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lango</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Du&#x0161;ek</surname><given-names>O</given-names> </name></person-group><article-title>Leak, cheat, repeat: data contamination and evaluation malpractices in closed-source LLMs</article-title><year>2024</year><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2402.03927">http://arxiv.org/abs/2402.03927</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.03927</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Riedemann</surname><given-names>L</given-names> </name><name name-style="western"><surname>Labonne</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gilbert</surname><given-names>S</given-names> </name></person-group><article-title>The path forward for large language models in medicine is open</article-title><source>NPJ Digit Med</source><year>2024</year><month>11</month><day>27</day><volume>7</volume><issue>1</issue><fpage>339</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01344-w</pub-id><pub-id pub-id-type="medline">39604549</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Rust</surname><given-names>J</given-names> </name><name name-style="western"><surname>Golombok</surname><given-names>S</given-names> </name></person-group><source>Modern Psychometrics: The Science of Psychological Assessment</source><year>2014</year><publisher-name>Routledge</publisher-name><pub-id pub-id-type="doi">10.4324/9781315787527</pub-id><pub-id pub-id-type="other">9781315787527</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goldberg</surname><given-names>LR</given-names> </name></person-group><article-title>The 
development of markers for the big-five factor structure</article-title><source>Psychol Assess</source><year>1992</year><volume>4</volume><issue>1</issue><fpage>26</fpage><lpage>42</lpage><pub-id pub-id-type="doi">10.1037/1040-3590.4.1.26</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Burnell</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hao</surname><given-names>H</given-names> </name><name name-style="western"><surname>Conway</surname><given-names>ARA</given-names> </name><name name-style="western"><surname>Orallo</surname><given-names>JH</given-names> </name></person-group><article-title>Revealing the structure of language model capabilities</article-title><year>2023</year><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2306.10062">http://arxiv.org/abs/2306.10062</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2306.10062</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bommasani</surname><given-names>R</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>T</given-names> </name></person-group><article-title>Holistic evaluation of language models</article-title><source>Ann N Y Acad Sci</source><year>2023</year><month>07</month><volume>1525</volume><issue>1</issue><fpage>140</fpage><lpage>146</lpage><pub-id pub-id-type="doi">10.1111/nyas.15007</pub-id><pub-id pub-id-type="medline">37230490</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nembhard</surname><given-names>IM</given-names> </name><name 
name-style="western"><surname>David</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ezzeddine</surname><given-names>I</given-names> </name><name name-style="western"><surname>Betts</surname><given-names>D</given-names> </name><name name-style="western"><surname>Radin</surname><given-names>J</given-names> </name></person-group><article-title>A systematic review of research on empathy in health care</article-title><source>Health Serv Res</source><year>2023</year><month>04</month><volume>58</volume><issue>2</issue><fpage>250</fpage><lpage>263</lpage><pub-id pub-id-type="doi">10.1111/1475-6773.14016</pub-id><pub-id pub-id-type="medline">35765156</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sorin</surname><given-names>V</given-names> </name><name name-style="western"><surname>Brin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Barash</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Large language models and empathy: systematic review</article-title><source>J Med Internet Res</source><year>2024</year><month>12</month><day>11</day><volume>26</volume><fpage>e52597</fpage><pub-id pub-id-type="doi">10.2196/52597</pub-id><pub-id pub-id-type="medline">39661968</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mehandru</surname><given-names>N</given-names> </name><name name-style="western"><surname>Miao</surname><given-names>BY</given-names> </name><name name-style="western"><surname>Almaraz</surname><given-names>ER</given-names> </name><name name-style="western"><surname>Sushil</surname><given-names>M</given-names> </name><name name-style="western"><surname>Butte</surname><given-names>AJ</given-names> </name><name 
name-style="western"><surname>Alaa</surname><given-names>A</given-names> </name></person-group><article-title>Evaluating large language models as agents in the clinic</article-title><source>NPJ Digit Med</source><year>2024</year><month>04</month><day>3</day><volume>7</volume><issue>1</issue><fpage>84</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01083-y</pub-id><pub-id pub-id-type="medline">38570554</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hernandez-Orallo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>L</given-names> </name><name name-style="western"><surname>Stillwell</surname><given-names>D</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Evaluating general-purpose AI with psychometrics</article-title><year>2023</year><comment><ext-link ext-link-type="uri" xlink:href="http://arxiv.org/abs/2310.16379">http://arxiv.org/abs/2310.16379</ext-link></comment><pub-id pub-id-type="doi">10.48550/arXiv.2310.16379</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Embretson</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Reise</surname><given-names>SP</given-names> </name></person-group><source>Item Response Theory</source><year>2013</year><publisher-name>Psychology Press</publisher-name><pub-id pub-id-type="doi">10.4324/9781410605269</pub-id><pub-id pub-id-type="other">9781410605269</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reise</surname><given-names>SP</given-names> </name><name name-style="western"><surname>Waller</surname><given-names>NG</given-names> </name></person-group><article-title>Item response theory and clinical measurement</article-title><source>Annu Rev Clin Psychol</source><year>2009</year><volume>5</volume><fpage>27</fpage><lpage>48</lpage><pub-id pub-id-type="doi">10.1146/annurev.clinpsy.032408.153553</pub-id><pub-id pub-id-type="medline">18976138</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mart&#x00ED;nez-Plumed</surname><given-names>F</given-names> </name><name name-style="western"><surname>Prud&#x00EA;ncio</surname><given-names>RBC</given-names> </name><name name-style="western"><surname>Mart&#x00ED;nez-Us&#x00F3;</surname><given-names>A</given-names> </name><name name-style="western"><surname>Hern&#x00E1;ndez-Orallo</surname><given-names>J</given-names> </name></person-group><article-title>Item response theory in AI: Analysing machine learning classifiers at the instance level</article-title><source>Artif Intell</source><year>2019</year><month>06</month><volume>271</volume><fpage>18</fpage><lpage>42</lpage><pub-id pub-id-type="doi">10.1016/j.artint.2018.09.004</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Gierl</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Haladyna</surname><given-names>TM</given-names> </name></person-group><source>Automatic item generation: theory and practice</source><year>2013</year><publisher-name>Routledge</publisher-name><pub-id pub-id-type="doi">10.4324/9780203803912</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hassan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kushniruk</surname><given-names>A</given-names> </name><name name-style="western"><surname>Borycki</surname><given-names>E</given-names> </name></person-group><article-title>Barriers to and facilitators of artificial intelligence adoption in health care: scoping review</article-title><source>JMIR Hum Factors</source><year>2024</year><month>08</month><day>29</day><volume>11</volume><fpage>e48633</fpage><pub-id pub-id-type="doi">10.2196/48633</pub-id><pub-id pub-id-type="medline">39207831</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chustecki</surname><given-names>M</given-names> </name></person-group><article-title>Benefits and risks of AI in health care: narrative review</article-title><source>Interact J Med Res</source><year>2024</year><month>11</month><day>18</day><volume>13</volume><fpage>e53616</fpage><pub-id pub-id-type="doi">10.2196/53616</pub-id><pub-id pub-id-type="medline">39556817</pub-id></nlm-citation></ref></ref-list></back></article>