<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e73603</article-id><article-id pub-id-type="doi">10.2196/73603</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Large Language Models&#x2019; Clinical Decision-Making on When to Perform a Kidney Biopsy: Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Toal</surname><given-names>Michael</given-names></name><degrees>MBChB</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hill</surname><given-names>Christopher</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Quinn</surname><given-names>Michael</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>O'Neill</surname><given-names>Ciaran</given-names></name><degrees>PhD</degrees><xref 
ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Maxwell</surname><given-names>Alexander P</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Centre for Public Health, Royal Victoria Hospital, Queen&#x2019;s University Belfast</institution><addr-line>Grosvenor Road</addr-line><addr-line>Belfast</addr-line><country>United Kingdom</country></aff><aff id="aff2"><institution>Regional Centre for Nephrology and Transplantation, Belfast City Hospital</institution><addr-line>Belfast</addr-line><country>United Kingdom</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Rostoker</surname><given-names>Guy</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Pilgram</surname><given-names>Lisa</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Michael Toal, MBChB, Centre for Public Health, Royal Victoria Hospital, Queen&#x2019;s University Belfast, Grosvenor Road, Belfast, BT12 6BA, United Kingdom, 44 28 9097 6350; <email>mtoal11@qub.ac.uk</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>18</day><month>9</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e73603</elocation-id><history><date date-type="received"><day>07</day><month>03</month><year>2025</year></date><date date-type="rev-recd"><day>02</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>02</day><month>06</month><year>2025</year></date></history><copyright-statement>&#x00A9; Michael Toal, Christopher Hill, Michael Quinn, Ciaran O'Neill, Alexander P Maxwell. 
Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 18.9.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e73603"/><abstract><sec><title>Background</title><p>Artificial intelligence (AI) and large language models (LLMs) are increasing in sophistication and are being integrated into many disciplines. The potential for LLMs to augment clinical decision-making is an evolving area of research.</p></sec><sec><title>Objective</title><p>This study compared the responses of over 1000 kidney specialist physicians (nephrologists) with the outputs of commonly used LLMs using a questionnaire determining when a kidney biopsy should be performed.</p></sec><sec sec-type="methods"><title>Methods</title><p>This research group completed a large online questionnaire for nephrologists to determine when a kidney biopsy should be performed. The questionnaire was co-designed with patient input, refined through multiple iterations, and piloted locally before international dissemination. 
It was the largest international study in the field and demonstrated variation among human clinicians in biopsy propensity relating to human factors such as sex and age, as well as systemic factors such as country, job seniority, and technical proficiency. The same questions were put to both human doctors and LLMs in an identical order in a single session. Eight commonly used LLMs were interrogated: ChatGPT-3.5, Mistral Hugging Face, Perplexity, Microsoft Copilot, Llama 2, GPT-4, MedLM, and Claude 3. The most common response given by clinicians (human mode) for each question was taken as the baseline for comparison. Questionnaire responses on the indications and contraindications for biopsy generated a score (0-44) reflecting biopsy propensity, in which a higher score was used as a surrogate marker for an increased tolerance of potential associated risks.</p></sec><sec sec-type="results"><title>Results</title><p>The ability of LLMs to reproduce human expert consensus varied widely with some models demonstrating a balanced approach to risk in a similar manner to humans, while other models reported outputs at either end of the spectrum for risk tolerance. In terms of agreement with the human mode, ChatGPT-3.5 and GPT-4 (OpenAI) had the highest levels of alignment, agreeing with the human mode on 6 out of 11 questions. The total biopsy propensity score generated from the human mode was 23 out of 44. Both OpenAI models produced similar propensity scores between 22 and 24. However, Llama 2 and MS Copilot also scored within this range but with poorer response alignment to the human consensus at only 2 out of 11 questions. 
The most risk-averse model in this study was MedLM, with a propensity score of 11, and the least risk-averse model was Claude 3, with a score of 34.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The outputs of LLMs demonstrated a modest ability to replicate human clinical decision-making in this study; however, performance varied widely between LLMs. Questions with more uniform human responses produced LLM outputs with higher alignment, whereas questions with lower human consensus showed poorer output alignment. This may limit the practical use of LLMs in real-world clinical practice.</p></sec></abstract><kwd-group><kwd>kidney biopsy</kwd><kwd>renal biopsy</kwd><kwd>nephrology</kwd><kwd>chronic kidney disease</kwd><kwd>kidney failure</kwd><kwd>proteinuria</kwd><kwd>hematuria</kwd><kwd>glomerulonephritis</kwd><kwd>machine learning</kwd><kwd>large language models</kwd><kwd>artificial intelligence</kwd><kwd>decision support</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Artificial Intelligence in Health Care</title><p>The rapid expansion of artificial intelligence (AI) has impacted numerous disciplines over recent decades. This technology aims to improve efficiency; however, there are concerns that human roles may be replaced and that autonomous AI could cause significant disruption to societies [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. AI is now seamlessly integrated into everyday life, and common functions such as predictive texting, review summaries, and customer service chatbots rely on this technology. Generative AI can be used to rapidly synthesize and assist with the creation of images and text, which has led academic institutions to consider how to effectively undertake assessments.
Large language models (LLMs), such as ChatGPT, Copilot, and Llama, have rapidly proliferated to employ these developments for personal or professional use.</p><p>AI has also been used effectively in health care, and further expansion is predicted in the years ahead [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>]. There are many demands on health care resources across the world [<xref ref-type="bibr" rid="ref5">5</xref>], and AI offers opportunities to automate routine human tasks, allowing human practitioners to use their time more effectively on complex problems that are currently beyond the scope of AI [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. In some circumstances, chatbot-generated outputs have even been found to be of higher quality and convey deeper empathy than human responses [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. Diagnostic specialties offer a good template for this illustration. In pathology, AI has been used to rapidly characterize the nature of lesions for rapid detection and coding, allowing the pathologist to analyze specimens with greater efficiency [<xref ref-type="bibr" rid="ref10">10</xref>]. In radiology, similar pattern recognition has been used to quickly identify abnormalities, as well as help the radiologist prioritize their workflow, so that the most abnormal or urgent scans are reported first [<xref ref-type="bibr" rid="ref7">7</xref>].</p></sec><sec id="s1-2"><title>Limitations of Artificial Intelligence</title><p>The rise in AI usage has raised significant concerns. Intelligence is not equivalent to wisdom, and AI outputs are dependent on the data used to train these models. Although Generative AI can produce a detailed response, one criticism is that its output lacks the &#x201C;common sense&#x201D; of humans [<xref ref-type="bibr" rid="ref11">11</xref>]. 
LLMs can generate false information in the form of hallucinations and produce gender- or racially biased outputs [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. LLMs are sensitive to phrasing and can generate errors by varying the order of words [<xref ref-type="bibr" rid="ref14">14</xref>]. How each company trains its LLMs remains confidential, and this lack of transparency is another cause of concern [<xref ref-type="bibr" rid="ref3">3</xref>]. Using AI within health care is dependent on its alignment with human values to establish trust from service users, which is another challenge of any new technology, especially given the issues discussed [<xref ref-type="bibr" rid="ref15">15</xref>].</p></sec><sec id="s1-3"><title>Clinical Decision Support</title><p>Medical practitioners make numerous clinical decisions throughout their working day. How they make these decisions remains poorly understood and open to many potential biases and influences [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Given that some of these decisions may stand between life and death, harnessing AI to assist physicians in making the best clinical decisions for each patient based on the available body of evidence may represent an opportunity to improve efficiency and enhance safe patient care. LLM outputs have been found to be superior to junior surgical residents&#x2019; clinical decision-making but inferior to that of senior colleagues; however, in these studies, LLM outputs were limited by inconsistencies and inaccuracies [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. 
LLMs have been shown to easily pass high-stakes written medical examinations such as the United States Medical Licensing Examination (USMLE) [<xref ref-type="bibr" rid="ref20">20</xref>] and the Membership of the Royal College of Physicians of the United Kingdom (MRCP(UK)) [<xref ref-type="bibr" rid="ref21">21</xref>]; however, they appear to perform poorly in questions related to rare diseases, perhaps due to a paucity of training data [<xref ref-type="bibr" rid="ref22">22</xref>]. In the Polish nephrology specialty examination, GPT-4 performed at a level similar to the average human candidate but below that of the top candidates [<xref ref-type="bibr" rid="ref23">23</xref>].</p><p>Our research group completed a large international survey of physicians&#x2019; clinical decision-making, recruiting over 1000 doctors from 83 countries to complete a short online questionnaire [<xref ref-type="bibr" rid="ref24">24</xref>]. In this study, nephrologists (kidney specialists) were asked to determine when a kidney biopsy was required using clinical scenarios of potential indications and contraindications. A kidney biopsy is used to define the type of kidney disease a patient has so that appropriate treatment can be administered. A biopsy is an invasive investigation with a small but significant risk of serious bleeding complications [<xref ref-type="bibr" rid="ref25">25</xref>]. 
The use of AI in nephrology is increasing with recent studies assessing LLM usage for guideline adherence, dialysis management, and specialist examinations [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref29">29</xref>].</p><p>We aimed to compare the responses of over 1000 human doctors with those of LLMs, using the same questions on biopsy practice in the same order, to determine if AI can be used as a clinical decision tool in a safe and effective manner.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Questionnaire Design</title><p>The detailed methods for the questionnaire have been described in a previous paper [<xref ref-type="bibr" rid="ref24">24</xref>]. In brief, the questionnaire was designed for kidney specialist doctors (nephrologists) to investigate the indications and contraindications for a kidney biopsy. The research team, consisting of 4 nephrologists and 1 health economist, co-designed the questionnaire with input from patient participants. This instrument was refined through multiple iterations and a pilot study undertaken in a group of 39 clinicians before wider dissemination. A biopsy propensity score between 0&#x2010;44 was generated based on the responses to 11 questions (0&#x2010;4) on indications and contraindications, with a higher score demonstrating an increased propensity to recommend biopsy in a given scenario and therefore a greater tolerance of the associated risks. Scores of 0&#x2010;44 were normalized to 0%&#x2010;100% for clarity. This allowed comparisons between nephrologists to determine if they were more or less likely to recommend this investigation when placed in an identical clinical situation. For each question, respondents were asked to select 1 of 5 possible responses to the prompt. 
For the clinical vignettes on indications, this was on a Likert scale from &#x201C;definitely yes&#x201D; to &#x201C;definitely no,&#x201D; and for contraindications, by defining a threshold of acceptable risk for clinical parameters associated with bleeding complications. The most common response (mode) for each question given by human respondents was determined to be the baseline for comparison with LLM outputs. For each question, the mode was selected by a minimum of 345 and a maximum of 728 human clinicians.</p></sec><sec id="s2-2"><title>LLM Application</title><p>Responses to the human questionnaire were collected from August 2023 to January 2024. LLMs were interrogated from March 2024 to June 2024. At this time, the results were not publicly available and therefore could not have been part of the evidence base used by the LLM to generate responses. The questions put to the LLM were identical (except for removing the words &#x201C;in your opinion&#x201D;) to those presented to human clinicians. They were also presented in the same order in a single session. The full transcripts generated by the LLMs are included in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and were reviewed by the first author to assign scoring, which was subsequently reviewed and verified by two other coauthors (CON and APM).</p><p>A propensity score was generated for each LLM based on the responses to these 11 questions using the same scoring method as for human respondents. Therefore, an LLM that generated a higher score would be more inclined to recommend this investigation and therefore less risk averse. By contrast, a lower score would be indicative of being less inclined to recommend this investigation and more risk averse. 
An LLM was determined as being a perfect match to human clinicians if the answer selected was identical to the mode in the human questionnaire.</p></sec><sec id="s2-3"><title>Ethical Considerations</title><p>Ethical approval for this project was granted by the Faculty of Medicine, Health, and Life Sciences Research Ethics Committee of Queen&#x2019;s University, Belfast (project MHLS 22_175) on February 15, 2023; the study was conducted in accordance with the Declaration of Helsinki. Human participants completed an online questionnaire about kidney biopsy practice [<xref ref-type="bibr" rid="ref24">24</xref>]. A statement giving consent to participate was displayed to the clinician on the first screen of the questionnaire. No identifying information was collected. No compensation was provided to participants.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Human Respondents&#x2019; Characteristics</title><p>A total of 1181 clinicians from 83 countries participated in the study. A summary of clinician characteristics is given in <xref ref-type="table" rid="table1">Table 1</xref>. The study was open to nephrologist trainees and fellows, who comprised 14.3% (n=168) of the total cohort.</p><p>The United States had the largest single national group, and 43 states were represented in this cohort. The 4 devolved nations in the United Kingdom were also represented in the second largest cumulative group.
Thirteen nations had more than 20 clinicians included.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Characteristics of human participants.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Characteristics</td><td align="left" valign="bottom">Values, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Sex</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Male</td><td align="left" valign="top">753 (64.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Female</td><td align="left" valign="top">408 (34.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Prefer not to say</td><td align="left" valign="top">9 (0.8)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Nonbinary/third gender</td><td align="left" valign="top">2 (0.2)</td></tr><tr><td align="left" valign="top">Age (y)</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>20&#x2010;29</td><td align="left" valign="top">30 (2.5)</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>30&#x2010;39</td><td align="left" valign="top">442 (37.5)</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>40&#x2010;49</td><td align="left" valign="top">327 (27.7)</td></tr><tr><td align="char" char="." 
valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>50&#x2010;59</td><td align="left" valign="top">251 (21.3)</td></tr><tr><td align="char" char="." valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>60 or older</td><td align="left" valign="top">130 (11.0)</td></tr><tr><td align="left" valign="top">Current job title</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Trainee/fellow</td><td align="left" valign="top">168 (14.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Associate specialist/specialty doctor</td><td align="left" valign="top">122 (10.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Consultant/attending physician</td><td align="left" valign="top">733 (62.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Clinical director or professor</td><td align="left" valign="top">154 (13.1)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other</td><td align="left" valign="top">1 (0.1)</td></tr><tr><td align="left" valign="top">Continent of practice</td><td align="left" valign="top">&#x2003;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Europe</td><td align="left" valign="top">405 (34.4)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>North America</td><td align="left" valign="top">352 (29.9)</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>South America</td><td align="left" valign="top">85 (7.2)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Asia</td><td align="left" valign="top">216 (18.3)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Africa</td><td align="left" valign="top">67 (5.7)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Oceania</td><td align="left" valign="top">54 (4.6)</td></tr></tbody></table></table-wrap></sec><sec id="s3-2"><title>LLM Interrogation</title><p>A total of 8 LLMs were interrogated, as detailed in <xref ref-type="table" rid="table2">Table 2</xref>. The full transcripts of the dialogues are detailed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The outputs produced by the LLMs varied greatly in terms of detail; however, each LLM was instructed to choose an answer from 5 options. Some LLMs selected more than 1 answer to certain prompts, refused to give an answer, or produced incomplete sentences; however, in most instances, the question was answered as instructed. An introductory prompt for context was added for GPT-4. The programs that were free to use without subscription were interrogated by the first author. 
GPT-4 and MedLM were not freely available; therefore, an additional operator with access was employed to reproduce these methods.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Large language models (LLMs) used and dates of interrogation.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">LLM</td><td align="left" valign="bottom">Date of interrogation</td><td align="left" valign="bottom">Availability</td></tr></thead><tbody><tr><td align="left" valign="top">OpenAI: ChatGPT-3.5</td><td align="left" valign="top">March 27, 2024</td><td align="left" valign="top">Free without subscription</td></tr><tr><td align="left" valign="top">Mistral Hugging Face</td><td align="left" valign="top">March 28, 2024</td><td align="left" valign="top">Free without subscription</td></tr><tr><td align="left" valign="top">Perplexity</td><td align="left" valign="top">March 28, 2024</td><td align="left" valign="top">Free without subscription</td></tr><tr><td align="left" valign="top">Microsoft Copilot</td><td align="left" valign="top">March 28, 2024</td><td align="left" valign="top">Free without subscription</td></tr><tr><td align="left" valign="top">Llama 2 13b chatbot</td><td align="left" valign="top">April 3, 2024</td><td align="left" valign="top">Free without subscription</td></tr><tr><td align="left" valign="top">OpenAI: GPT-4</td><td align="left" valign="top">April 22, 2024</td><td align="left" valign="top">Subscription</td></tr><tr><td align="left" valign="top">MedLM</td><td align="left" valign="top">April 26, 2024</td><td align="left" valign="top">Subscription</td></tr><tr><td align="left" valign="top">Claude 3</td><td align="left" valign="top">June 13, 2024</td><td align="left" valign="top">Free without subscription</td></tr></tbody></table></table-wrap></sec><sec id="s3-3"><title>LLM Prompts</title><p>Clinicians were asked whether, in their opinion, a kidney biopsy was required in the setting of 7 
fictional clinical vignettes. All cases were adults with unexplained abnormalities in kidney function, reported as estimated glomerular filtration rate and urinary tests (hematuria or proteinuria quantified as grams per day). Four cases were a first presentation to a nephrologist, and in three, there was a dynamic change over the course of a year.</p><p>The determination of when clinicians felt the risk of kidney biopsy outweighed the benefits was explored in a section on potential contraindications, particularly relating to bleeding risk. In the first section, clinicians were presented with 5 options and asked for the limits of acceptable parameters to proceed to biopsy. This could be the minimum level (eg, hemoglobin) or maximum level (eg, systolic blood pressure). The question prompts given to the LLMs are detailed in <xref ref-type="table" rid="table3">Table 3</xref>.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Question prompts given to large language models.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Question code</td><td align="left" valign="bottom">Full question</td></tr></thead><tbody><tr><td align="left" valign="top">Q1</td><td align="left" valign="top">Is a renal biopsy required for an adult in the first detection of an unexplained nephrotic syndrome of proteinuria 4g/day, peripheral oedema and eGFR<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>&#x003E;60 ml/min/1.73m<sup>2</sup>? Choose from Definitely yes, Probably yes, Unsure, Probably not, Definitely not</td></tr><tr><td align="left" valign="top">Q2</td><td align="left" valign="top">Is a renal biopsy required for an adult in the first detection of unexplained non-visible haematuria, 2g/day of proteinuria and eGFR 40? 
Choose from Definitely yes, Probably yes, Unsure, Probably not, Definitely not</td></tr><tr><td align="left" valign="top">Q3</td><td align="left" valign="top">Is a renal biopsy required for an adult in the first detection of unexplained non-visible haematuria, 2g/day of proteinuria and eGFR 20 with normal kidney appearances on ultrasound? Choose from Definitely yes, Probably yes, Unsure, Probably not, Definitely not</td></tr><tr><td align="left" valign="top">Q4</td><td align="left" valign="top">Is a renal biopsy required for an adult in the first detection of unexplained non-visible haematuria, 2g/day of proteinuria and eGFR 20 with reduced kidney size on ultrasound? Choose from Definitely yes, Probably yes, Unsure, Probably not, Definitely not</td></tr><tr><td align="left" valign="top">Q5</td><td align="left" valign="top">Is a renal biopsy required for an adult with an unexplained rise in proteinuria from 0.5 to 2g/day in one year with an eGFR&#x003E;60? Choose from Definitely yes, Probably yes, Unsure, Probably not, Definitely not</td></tr><tr><td align="left" valign="top">Q6</td><td align="left" valign="top">Is a renal biopsy required for an adult with an unexplained fall in eGFR from 55 to 40 in one year with proteinuria stable at 0.5g/day? Choose from Definitely yes, Probably yes, Unsure, Probably not, Definitely not</td></tr><tr><td align="left" valign="top">Q7</td><td align="left" valign="top">Is a renal biopsy required for an adult with an unexplained fall in eGFR from 55 to 40 AND rise in proteinuria from 0.5 to 2 g/day in one year? Choose from Definitely yes, Probably yes, Unsure, Probably not, Definitely not</td></tr><tr><td align="left" valign="top">Q8</td><td align="left" valign="top">What is the minimum acceptable Haemoglobin for native renal biopsy? 
Choose from 100g/l, 90 g/l, 80 g/l, Other (please specify) and No minimum level</td></tr><tr><td align="left" valign="top">Q9</td><td align="left" valign="top">What is the minimum acceptable Platelet count for native renal biopsy? Choose from 150&#x00D7;10<sup>9</sup>, 100&#x00D7;10<sup>9</sup>, 50&#x00D7;10<sup>9</sup>, Other (please specify), No minimum level</td></tr><tr><td align="left" valign="top">Q10</td><td align="left" valign="top">What is the maximum acceptable International Normalised Ratio for native renal biopsy? Choose from 1.2, 1.4, 1.6, Other (please specify) and No maximum level</td></tr><tr><td align="left" valign="top">Q11</td><td align="left" valign="top">What is the maximum acceptable Systolic Blood Pressure for native renal biopsy? Choose from 140 mmHg, 160 mmHg, 180 mmHg, Other (please specify) and No maximum level</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>eGFR: estimated glomerular filtration rate.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Comparing Human Doctor and LLM Responses</title><p>In the human questionnaire, all available options were selected by at least 3 and at most 728 human doctors. Therefore, none of the LLM outputs could be considered outside the range of responses that a human doctor may select. The subject of kidney biopsy decision-making was chosen for this clinician questionnaire because it is subjective; therefore, there is a range of acceptable answers. The mode of each answer represents varying proportions of responses to each question. For question 8, there was no clear consensus, and the mode was selected by only 31.6% of respondents; however, for question 9, the consensus was clearer, and 66.5% of respondents selected the mode. Responses are detailed in <xref ref-type="table" rid="table4">Table 4</xref>.</p><p>The level of agreement between the mode of human responses and LLM outputs ranged from 0 out of 11 (Mistral Hugging Face) to 6 out of 11 (ChatGPT-3.5 and GPT-4).
Four of eight LLMs generated a biopsy propensity score that was equal to or within one point of the human mode score (ChatGPT-3.5, GPT-4, MS Copilot, and Llama 2).</p><p>Using this propensity score as a surrogate marker for clinical risk aversion, the most risk-averse LLM output was MedLM with a score of 11, which produced outputs equivalent to the lowest 1% of biopsy propensity scores in human respondents. By contrast, the Claude 3 output produced the highest biopsy propensity score of 34, indicating the lowest level of risk aversion, a score higher than 99% of human respondents. For both MedLM and Claude 3, there was a reasonable agreement between outputs and human responses with 4 or 5 exact matches out of 11; however, the overall approach to risk, as indicated by the propensity score, was not typical of human responses.</p><p>In terms of which LLM most accurately represented human doctor responses, the two OpenAI LLMs, ChatGPT-3.5 and GPT-4, were the optimal programs for agreement with the human mode and profile of risk aversion, as indicated by the propensity score.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Comparison of questionnaire responses between human consensus and large language models (LLMs).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Question</td><td align="left" valign="bottom">Humans (N=1181)<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>, n (%)<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="bottom">ChatGPT3.5</td><td align="left" valign="bottom">Mistral Hugging Face</td><td align="left" valign="bottom">Perplexity</td><td align="left" valign="bottom">MS Copilot</td><td align="left" valign="bottom">Llama 2 13b chat</td><td align="left" valign="bottom">GPT4</td><td align="left" valign="bottom">MedLM</td><td align="left" valign="bottom">Claude 3</td></tr></thead><tbody><tr><td align="left" 
valign="top">Q1</td><td align="left" valign="top">DY<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup>, 655 (58.1)</td><td align="left" valign="top">PY<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top">NA<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup></td><td align="left" valign="top">DY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">PY</td><td align="left" valign="top">U<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup></td><td align="left" valign="top">DY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup> and PY</td><td align="left" valign="top">PY</td><td align="left" valign="top">DY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td></tr><tr><td align="left" valign="top">Q2</td><td align="left" valign="top">DY, 659 (59)</td><td align="left" valign="top">DY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">NA</td><td align="left" valign="top">DY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">PY</td><td align="left" valign="top">PY</td><td align="left" valign="top">PY</td><td align="left" valign="top">PN<sup><xref ref-type="table-fn" rid="table4fn8">h</xref></sup></td><td align="left" valign="top">PY</td></tr><tr><td align="left" valign="top">Q3</td><td align="left" valign="top">DY, 571 (51.2)</td><td align="left" valign="top">DY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">NA</td><td align="left" valign="top">DY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">PY</td><td align="left" valign="top">U</td><td align="left" valign="top">DY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">PN</td><td align="left" valign="top">DY<sup><xref ref-type="table-fn" 
rid="table4fn6">f</xref></sup></td></tr><tr><td align="left" valign="top">Q4</td><td align="left" valign="top">PN, 571 (51.4)</td><td align="left" valign="top">DY</td><td align="left" valign="top">NA</td><td align="left" valign="top">DY</td><td align="left" valign="top">PY</td><td align="left" valign="top">DY</td><td align="left" valign="top">PN<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup> and DN<sup><xref ref-type="table-fn" rid="table4fn9">i</xref></sup></td><td align="left" valign="top">PN<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">DY</td></tr><tr><td align="left" valign="top">Q5</td><td align="left" valign="top">PY, 521 (47.0)</td><td align="left" valign="top">PN</td><td align="left" valign="top">NA</td><td align="left" valign="top">PY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">U</td><td align="left" valign="top">PY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">PY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">PN</td><td align="left" valign="top">PY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td></tr><tr><td align="left" valign="top">Q6</td><td align="left" valign="top">PN, 398 (36.1)</td><td align="left" valign="top">PN<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">NA</td><td align="left" valign="top">PY</td><td align="left" valign="top">PY</td><td align="left" valign="top">U</td><td align="left" valign="top">U</td><td align="left" valign="top">PN<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">PY</td></tr><tr><td align="left" valign="top">Q7</td><td align="left" valign="top">PY, 565 (51.7)</td><td align="left" valign="top">PY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">NA</td><td 
align="left" valign="top">DY</td><td align="left" valign="top">PY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">U and PY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">DY</td><td align="left" valign="top">PY<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">DY</td></tr><tr><td align="left" valign="top">Q8 Hb<sup><xref ref-type="table-fn" rid="table4fn10">j</xref></sup> (g/l)</td><td align="left" valign="top">90, 345 (31.6)</td><td align="left" valign="top">100</td><td align="left" valign="top">80</td><td align="left" valign="top">Other</td><td align="left" valign="top">Other</td><td align="left" valign="top">80</td><td align="left" valign="top">100</td><td align="left" valign="top">90<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">80</td></tr><tr><td align="left" valign="top">Q9 Plat<sup><xref ref-type="table-fn" rid="table4fn11">k</xref></sup> (&#x00D7;10<sup>9</sup>)</td><td align="left" valign="top">100, 728 (66.5)</td><td align="left" valign="top">100<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">50</td><td align="left" valign="top">50</td><td align="left" valign="top">100<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">150</td><td align="left" valign="top">100<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">100<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">50</td></tr><tr><td align="left" valign="top">Q10 INR<sup><xref ref-type="table-fn" rid="table4fn12">l</xref></sup></td><td align="left" valign="top">1.2, 557 (51.1)</td><td align="left" valign="top">1.4</td><td align="left" valign="top">1.4</td><td align="left" valign="top">1.5</td><td align="left" valign="top">1.5</td><td 
align="left" valign="top">1.4</td><td align="left" valign="top">1.4</td><td align="left" valign="top">1.4</td><td align="left" valign="top">1.5</td></tr><tr><td align="left" valign="top">Q11 SBP<sup><xref ref-type="table-fn" rid="table4fn13">m</xref></sup> (mmHg)</td><td align="left" valign="top">160, 600 (54.7)</td><td align="left" valign="top">160<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">140</td><td align="left" valign="top">140</td><td align="left" valign="top">140</td><td align="left" valign="top">140</td><td align="left" valign="top">160<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td><td align="left" valign="top">140</td><td align="left" valign="top">160<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td></tr><tr><td align="left" valign="top">Total score</td><td align="left" valign="top">23</td><td align="left" valign="top">23</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table4fn14">n</xref></sup></td><td align="left" valign="top">29</td><td align="left" valign="top">23</td><td align="left" valign="top">22</td><td align="left" valign="top">22&#x2010;24</td><td align="left" valign="top">11</td><td align="left" valign="top">34</td></tr><tr><td align="left" valign="top">Agreement</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">6/11</td><td align="left" valign="top">0/11</td><td align="left" valign="top">4/11</td><td align="left" valign="top">2/11</td><td align="left" valign="top">2/11</td><td align="left" valign="top">6/11</td><td align="left" valign="top">5/11</td><td align="left" valign="top">4/11</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>There are small variations in the numbers of human participants who answered each question, and the denominator of the percentage is derived from the number of humans who answered each question, rather than total participants in the study. 
More details are provided on this in the following study [<xref ref-type="bibr" rid="ref24">24</xref>].</p></fn><fn id="table4fn2"><p><sup>b</sup>Most common response (mode) given by human participants. Numbers in parentheses represent the proportion of human respondents who selected the mode for each question. </p></fn><fn id="table4fn3"><p><sup>c</sup>DY: definitely yes.</p></fn><fn id="table4fn4"><p><sup>d</sup>PY: probably yes.</p></fn><fn id="table4fn5"><p><sup>e</sup>NA: no answer given.</p></fn><fn id="table4fn6"><p><sup>f</sup>LLM output contains mode of human responses.</p></fn><fn id="table4fn7"><p><sup>g</sup>U: unsure.</p></fn><fn id="table4fn8"><p><sup>h</sup>PN: probably not.</p></fn><fn id="table4fn9"><p><sup>i</sup>DN: definitely not.</p></fn><fn id="table4fn10"><p><sup>j</sup>Hb: hemoglobin.</p></fn><fn id="table4fn11"><p><sup>k</sup>Plat: platelet count.</p></fn><fn id="table4fn12"><p><sup>l</sup>INR: international normalized ratio.</p></fn><fn id="table4fn13"><p><sup>m</sup>SBP: systolic blood pressure.</p></fn><fn id="table4fn14"><p><sup>n</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>In this study, the questionnaire responses of nephrologists on clinical decision-making were replicated by some LLMs. The degree of fidelity differed among LLMs, and the OpenAI models ChatGPT-3.5 and GPT-4 produced outputs that were the most consistent with typical clinician responses. Similar to our study of human clinicians, most of the LLMs interrogated opted for a balanced approach to this dilemma, producing comparable responses about when to perform a kidney biopsy and when to avoid this procedure.</p><p>There were varying degrees of agreement among human respondents, with the mode selected by 31.6% to 66.5% of respondents. 
This variation brings ambiguity to the &#x201C;ground-truth&#x201D; in each scenario with inconsistent dispersion of answers. This spectrum of consensus was also replicated among LLMs. In question 8, where only 31.6% of human respondents selected the mode, the mode was selected by only 1 out of 8 LLMs, the lowest in our study. Conversely, in question 9 with the highest agreement, where 66.5% of humans selected the mode, this human mode was also selected by 4 out of 8 LLMs, the joint highest in our study. This suggests that the gray areas of ambiguity in clinical decision-making can also be reflected in the LLM outputs. One potential use for this technology would be to assist the clinician in resolving an uncertain decision; however, in this instance, this uncertainty is also reflected in LLM outputs, limiting their utility in real-world clinical practice.</p><p>The propensity to perform an invasive kidney biopsy procedure is inevitably linked to tolerance of potential risks. Therefore, we used the propensity score as a surrogate marker for risk aversion among human clinicians. When this score was applied to LLMs, there was variable risk aversion among these models. MedLM outputs were the most risk-averse, indicating a higher threshold to perform a kidney biopsy, as well as a low tolerance for potential contraindications that would increase the risk of a bleeding complication. In contrast, the outputs for Claude 3 were the least risk-averse, meaning every clinical vignette was met with a response that a kidney biopsy was definitely or probably required, and the lower limits for potential contraindications could be considered by some clinicians to be reckless.</p><p>The length and detail of outputs generated by each LLM were variable, as described in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
Some models such as ChatGPT-3.5 and MedLM answered the question directly, with limited additional discussion of the reasoning behind decisions. Other models such as Llama 2 and Microsoft Copilot produced detailed responses outlining the dilemma and the known variances in practice before reaching a conclusion. The updated OpenAI model GPT-4 produced much longer outputs based on mean word count (164 words) compared to its predecessor ChatGPT-3.5 (19 words).</p><p>AI applications are currently in development to analyze large volumes of free text to allow for organized coding of data for research and analytic purposes [<xref ref-type="bibr" rid="ref6">6</xref>]. LLMs are improving rapidly, and specialized medical LLMs have demonstrated significant improvement with continual pretraining and instructional fine-tuning for tasks such as question answering, summarization, disease classification, and natural language inference [<xref ref-type="bibr" rid="ref30">30</xref>]. A US study compared human and LLM capabilities in detecting adverse events from a cannabinoid-based product from posts on a social media group, using human evaluators as a benchmark. In this study, ChatGPT-3.5 was able to detect any adverse events with 95% agreement with humans, and 99% agreement for serious events [<xref ref-type="bibr" rid="ref31">31</xref>]. However, LLMs are not sufficiently reliable for clinical care, as using AI scribes for physicians&#x2019; notes has produced text with significant errors, both by omission and by the inclusion of false statements [<xref ref-type="bibr" rid="ref32">32</xref>].</p></sec><sec id="s4-2"><title>Limitations</title><p>This study has several limitations that should be considered. First, this is a small sample of 11 questions used to interrogate LLMs; therefore, there is limited depth to this data, and caution is required not to overinterpret the reported results. 
Second, this study assessed the LLMs&#x2019; ability to make decisions based on short, simple case vignettes, and this may not necessarily be generalizable to more nuanced and complex &#x201C;real-life&#x201D; clinical scenarios, as LLM accuracy has been shown to be poorer on longer questions [<xref ref-type="bibr" rid="ref21">21</xref>]. Third, using a mode as the human benchmark is a limitation for questions with poor consensus, where the &#x201C;ground-truth&#x201D; is less evident; moreover, all human responses were treated as equal, despite vastly differing levels of clinical experience.</p></sec><sec id="s4-3"><title>Strengths</title><p>This study also has notable strengths. Human decision-making is poorly understood, and clinical decisions should be based on integrating the best available evidence for the care of an individual patient. AI-assisted decision aids are rapidly expanding into medicine, and this is the first study to our knowledge that compares a large sample of human responses to LLM outputs based on identical scenarios.</p></sec><sec id="s4-4"><title>Implications for Future Research</title><p>There has been a rapid proliferation of medical research into the use of AI in health care; however, how these tools are best integrated into clinical practice remains unclear. As LLMs continue to increase in sophistication and accuracy, AI assistance will likely become integral to all aspects of life. How best to apply this technology in health care remains a challenge to be addressed in the coming years. It is important that LLM outputs align with human values, which can be achieved through supervised reinforcement learning with input from expert physicians and patients [<xref ref-type="bibr" rid="ref15">15</xref>].</p></sec><sec id="s4-5"><title>Conclusions</title><p>Some LLMs can modestly replicate human clinical decision-making when short clinical vignettes are presented. 
There is variable performance in these models; however, ChatGPT-3.5 and GPT-4 outputs were the most consistent with humans in our study. Caution should be applied when considering how these LLMs can be used to assist clinicians, as there remain many unanswered questions as to how physicians should use these tools for safe and effective patient care.</p></sec></sec></body><back><ack><p>The authors would like to thank Mr Marc McNicholl and Mr Tushar Gandhi for their assistance in reproducing the applied methods for paid services.</p><p>MT is supported by a clinical research fellowship award from the Northern Ireland Kidney Research Fund. Financial support for publication charges was provided by the Belfast Health and Social Care Trust Charitable Trust Funds committee. Neither organization had input into the design or conduct of this study.</p><p>The authors report that no artificial intelligence tools were used in the creation of this manuscript.</p></ack><notes><sec><title>Data Availability</title><p>Data is available upon reasonable request by contacting the corresponding author.</p></sec></notes><fn-group><fn fn-type="con"><p>Funding acquisition: MT</p><p>Conceptualization: MT</p><p>Study development: MT</p><p>Formal analysis: MT</p><p>Writing &#x2013; original draft: MT</p><p>Supervision: APM, CON, CH, and MQ</p><p>Writing &#x2013; review &#x0026; editing: APM, CON, CH, and MQ</p><p>Validation: APM and CON</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">MRCP(UK)</term><def><p>Membership of the Royal College of Physicians</p></def></def-item><def-item><term id="abb4">USMLE</term><def><p>United States Medical Licensing 
Examination</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name></person-group><article-title>Large language models will not replace healthcare professionals: curbing popular fears and hype</article-title><source>J R Soc Med</source><year>2023</year><month>05</month><volume>116</volume><issue>5</issue><fpage>181</fpage><lpage>182</lpage><pub-id pub-id-type="doi">10.1177/01410768231173123</pub-id><pub-id pub-id-type="medline">37199678</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hager</surname><given-names>P</given-names> </name><name name-style="western"><surname>Jungmann</surname><given-names>F</given-names> </name><name name-style="western"><surname>Holland</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Evaluation and mitigation of the limitations of large language models in clinical decision-making</article-title><source>Nat Med</source><year>2024</year><month>09</month><volume>30</volume><issue>9</issue><fpage>2613</fpage><lpage>2622</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03097-1</pub-id><pub-id pub-id-type="medline">38965432</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name></person-group><article-title>How can the clinical aptitude of AI assistants be assayed?</article-title><source>J Med Internet Res</source><year>2023</year><month>12</month><day>5</day><volume>25</volume><fpage>e51603</fpage><pub-id pub-id-type="doi">10.2196/51603</pub-id><pub-id pub-id-type="medline">38051572</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McCoy</surname><given-names>LG</given-names> </name><name name-style="western"><surname>Manrai</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Rodman</surname><given-names>A</given-names> </name></person-group><article-title>Large language models and the degradation of the medical record</article-title><source>N Engl J Med</source><year>2024</year><month>10</month><day>31</day><volume>391</volume><issue>17</issue><fpage>1561</fpage><lpage>1564</lpage><pub-id pub-id-type="doi">10.1056/NEJMp2405999</pub-id><pub-id pub-id-type="medline">39465898</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Urquhart</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ryan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hartigan</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A pilot feasibility study comparing large language models in extracting key information from ICU patient text records from an Irish population</article-title><source>Intensive Care Med Exp</source><year>2024</year><month>08</month><day>16</day><volume>12</volume><issue>1</issue><fpage>71</fpage><pub-id pub-id-type="doi">10.1186/s40635-024-00656-1</pub-id><pub-id pub-id-type="medline">39147878</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Najjar</surname><given-names>R</given-names> </name></person-group><article-title>Redefining radiology: a review of artificial intelligence integration in medical imaging</article-title><source>Diagnostics (Basel)</source><year>2023</year><month>08</month><day>25</day><volume>13</volume><issue>17</issue><fpage>2760</fpage><pub-id pub-id-type="doi">10.3390/diagnostics13172760</pub-id><pub-id pub-id-type="medline">37685300</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ayers</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Poliak</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dredze</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Comparing physician and artificial intelligence chatbot responses to patient questions posted to a public social media forum</article-title><source>JAMA Intern 
Med</source><year>2023</year><month>06</month><day>1</day><volume>183</volume><issue>6</issue><fpage>589</fpage><lpage>596</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2023.1838</pub-id><pub-id pub-id-type="medline">37115527</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Perlis</surname><given-names>RH</given-names> </name><name name-style="western"><surname>Goldberg</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Ostacher</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Schneck</surname><given-names>CD</given-names> </name></person-group><article-title>Clinical decision support for bipolar depression using large language models</article-title><source>Neuropsychopharmacology</source><year>2024</year><month>08</month><volume>49</volume><issue>9</issue><fpage>1412</fpage><lpage>1416</lpage><pub-id pub-id-type="doi">10.1038/s41386-024-01841-2</pub-id><pub-id pub-id-type="medline">38480911</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hermsen</surname><given-names>M</given-names> </name><name name-style="western"><surname>de Bel</surname><given-names>T</given-names> </name><name name-style="western"><surname>den Boer</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Deep learning-based histopathologic assessment of kidney tissue</article-title><source>J Am Soc Nephrol</source><year>2019</year><month>10</month><volume>30</volume><issue>10</issue><fpage>1968</fpage><lpage>1979</lpage><pub-id pub-id-type="doi">10.1681/ASN.2019020144</pub-id><pub-id pub-id-type="medline">31488607</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Kejriwal</surname><given-names>M</given-names> </name><name name-style="western"><surname>Santos</surname><given-names>H</given-names> </name><name name-style="western"><surname>Mulvehill</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>K</given-names> </name><name name-style="western"><surname>McGuinness</surname><given-names>DL</given-names> </name><name name-style="western"><surname>Lieberman</surname><given-names>H</given-names> </name></person-group><article-title>Can AI have common sense? Finding out will be key to achieving machine intelligence</article-title><source>Nature New Biol</source><year>2024</year><month>10</month><day>10</day><volume>634</volume><issue>8033</issue><fpage>291</fpage><lpage>294</lpage><pub-id pub-id-type="doi">10.1038/d41586-024-03262-z</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zack</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lehman</surname><given-names>E</given-names> </name><name name-style="western"><surname>Suzgun</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Assessing the potential of GPT-4 to perpetuate racial and gender biases in health care: a model evaluation study</article-title><source>Lancet Digit Health</source><year>2024</year><month>01</month><volume>6</volume><issue>1</issue><fpage>e12</fpage><lpage>e22</lpage><pub-id pub-id-type="doi">10.1016/S2589-7500(23)00225-X</pub-id><pub-id pub-id-type="medline">38123252</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fang</surname><given-names>X</given-names> </name><name 
name-style="western"><surname>Che</surname><given-names>S</given-names> </name><name name-style="western"><surname>Mao</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>X</given-names> </name></person-group><article-title>Bias of AI-generated content: an examination of news produced by large language models</article-title><source>Sci Rep</source><year>2024</year><month>03</month><day>4</day><volume>14</volume><issue>1</issue><fpage>5224</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-55686-2</pub-id><pub-id pub-id-type="medline">38433238</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Salihu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Gadiri</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Skalidis</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Towards AI-assisted cardiology: a reflection on the performance and limitations of using large language models in clinical decision-making</article-title><source>EuroIntervention</source><year>2023</year><month>12</month><day>4</day><volume>19</volume><issue>10</issue><fpage>e798</fpage><lpage>e801</lpage><pub-id pub-id-type="doi">10.4244/EIJ-D-23-00461</pub-id><pub-id pub-id-type="medline">38050992</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>KH</given-names> </name><name name-style="western"><surname>Healey</surname><given-names>E</given-names> </name><name 
name-style="western"><surname>Leong</surname><given-names>TY</given-names> </name><name name-style="western"><surname>Kohane</surname><given-names>IS</given-names> </name><name name-style="western"><surname>Manrai</surname><given-names>AK</given-names> </name></person-group><article-title>Medical artificial intelligence and human values</article-title><source>N Engl J Med</source><year>2024</year><month>05</month><day>30</day><volume>390</volume><issue>20</issue><fpage>1895</fpage><lpage>1904</lpage><pub-id pub-id-type="doi">10.1056/NEJMra2214183</pub-id><pub-id pub-id-type="medline">38810186</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hall</surname><given-names>KH</given-names> </name></person-group><article-title>Reviewing intuitive decision-making and uncertainty: the implications for medical education</article-title><source>Med Educ</source><year>2002</year><month>03</month><volume>36</volume><issue>3</issue><fpage>216</fpage><lpage>224</lpage><pub-id pub-id-type="doi">10.1046/j.1365-2923.2002.01140.x</pub-id><pub-id pub-id-type="medline">11879511</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sacks</surname><given-names>GD</given-names> </name><name name-style="western"><surname>Dawes</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Tsugawa</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>The association between risk aversion of surgeons and their clinical decision-making</article-title><source>J Surg Res</source><year>2021</year><month>12</month><volume>268</volume><fpage>232</fpage><lpage>243</lpage><pub-id pub-id-type="doi">10.1016/j.jss.2021.06.056</pub-id><pub-id pub-id-type="medline">34371282</pub-id></nlm-citation></ref><ref 
id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Palenzuela</surname><given-names>DL</given-names> </name><name name-style="western"><surname>Mullen</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Phitayakorn</surname><given-names>R</given-names> </name></person-group><article-title>AI Versus MD: evaluating the surgical decision-making accuracy of ChatGPT-4</article-title><source>Surgery</source><year>2024</year><month>08</month><volume>176</volume><issue>2</issue><fpage>241</fpage><lpage>245</lpage><pub-id pub-id-type="doi">10.1016/j.surg.2024.04.003</pub-id><pub-id pub-id-type="medline">38769038</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huo</surname><given-names>B</given-names> </name><name name-style="western"><surname>Calabrese</surname><given-names>E</given-names> </name><name name-style="western"><surname>Sylla</surname><given-names>P</given-names> </name><etal/></person-group><article-title>The performance of artificial intelligence large language model-linked chatbots in surgical decision-making for gastroesophageal reflux disease</article-title><source>Surg Endosc</source><year>2024</year><month>05</month><volume>38</volume><issue>5</issue><fpage>2320</fpage><lpage>2330</lpage><pub-id pub-id-type="doi">10.1007/s00464-024-10807-w</pub-id><pub-id pub-id-type="medline">38630178</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Nori</surname><given-names>H</given-names> </name><name name-style="western"><surname>King</surname><given-names>N</given-names> </name><name name-style="western"><surname>McKinney</surname><given-names>SM</given-names> 
</name><etal/></person-group><article-title>Capabilities of GPT-4 on medical challenge problems</article-title><source>arXiv</source><comment>Preprint posted online on Mar 20, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2303.13375</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maitland</surname><given-names>A</given-names> </name><name name-style="western"><surname>Fowkes</surname><given-names>R</given-names> </name><name name-style="western"><surname>Maitland</surname><given-names>S</given-names> </name></person-group><article-title>Can ChatGPT pass the MRCP (UK) written examinations? Analysis of performance and errors using a clinical decision-reasoning framework</article-title><source>BMJ Open</source><year>2024</year><month>03</month><day>15</day><volume>14</volume><issue>3</issue><fpage>e080558</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2023-080558</pub-id><pub-id pub-id-type="medline">38490655</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sandmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Riepenhausen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Plagwitz</surname><given-names>L</given-names> </name><name name-style="western"><surname>Varghese</surname><given-names>J</given-names> </name></person-group><article-title>Systematic analysis of ChatGPT, Google search and Llama 2 for clinical decision support tasks</article-title><source>Nat Commun</source><year>2024</year><month>03</month><day>6</day><volume>15</volume><issue>1</issue><fpage>2050</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-46411-8</pub-id><pub-id pub-id-type="medline">38448475</pub-id></nlm-citation></ref><ref 
id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nicikowski</surname><given-names>J</given-names> </name><name name-style="western"><surname>Szczepa&#x0144;ski</surname><given-names>M</given-names> </name><name name-style="western"><surname>Miedziaszczyk</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kudli&#x0144;ski</surname><given-names>B</given-names> </name></person-group><article-title>The potential of ChatGPT in medicine: an example analysis of nephrology specialty exams in Poland</article-title><source>Clin Kidney J</source><year>2024</year><month>08</month><volume>17</volume><issue>8</issue><fpage>sfae193</fpage><pub-id pub-id-type="doi">10.1093/ckj/sfae193</pub-id><pub-id pub-id-type="medline">39099569</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Toal</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Hill</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Quinn</surname><given-names>MP</given-names> </name><name name-style="western"><surname>McQuarrie</surname><given-names>EP</given-names> </name><name name-style="western"><surname>O&#x2019;Neill</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Maxwell</surname><given-names>AP</given-names> </name></person-group><article-title>An international study of variation in attitudes to kidney biopsy practice</article-title><source>Clin J Am Soc Nephrol</source><year>2025</year><month>03</month><day>1</day><volume>20</volume><issue>3</issue><fpage>377</fpage><lpage>386</lpage><pub-id pub-id-type="doi">10.2215/CJN.0000000607</pub-id><pub-id pub-id-type="medline">39705236</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hogan</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Mocanu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Berns</surname><given-names>JS</given-names> </name></person-group><article-title>The native kidney biopsy: update and evidence for best practice</article-title><source>Clin J Am Soc Nephrol</source><year>2016</year><month>02</month><day>5</day><volume>11</volume><issue>2</issue><fpage>354</fpage><lpage>362</lpage><pub-id pub-id-type="doi">10.2215/CJN.05750515</pub-id><pub-id pub-id-type="medline">26339068</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Thongprayoon</surname><given-names>C</given-names> </name><name name-style="western"><surname>Suppadungsuk</surname><given-names>S</given-names> </name><name name-style="western"><surname>Garcia Valencia</surname><given-names>OA</given-names> </name><name name-style="western"><surname>Cheungpasitporn</surname><given-names>W</given-names> </name></person-group><article-title>Integrating retrieval-augmented generation with large language models in nephrology: advancing practical applications</article-title><source>Medicina (Kaunas)</source><year>2024</year><month>03</month><day>8</day><volume>60</volume><issue>3</issue><fpage>445</fpage><pub-id pub-id-type="doi">10.3390/medicina60030445</pub-id><pub-id pub-id-type="medline">38541171</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miao</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Thongprayoon</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cheungpasitporn</surname><given-names>W</given-names> </name></person-group><article-title>Assessing the accuracy of ChatGPT on core questions in glomerular disease</article-title><source>Kidney Int Rep</source><year>2023</year><month>08</month><volume>8</volume><issue>8</issue><fpage>1657</fpage><lpage>1659</lpage><pub-id pub-id-type="doi">10.1016/j.ekir.2023.05.014</pub-id><pub-id pub-id-type="medline">37547515</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maursetter</surname><given-names>L</given-names> </name></person-group><article-title>Will ChatGPT be the next nephrologist?</article-title><source>Clin J Am Soc Nephrol</source><year>2024</year><month>01</month><day>1</day><volume>19</volume><issue>1</issue><fpage>2</fpage><lpage>4</lpage><pub-id pub-id-type="doi">10.2215/CJN.0000000000000378</pub-id><pub-id pub-id-type="medline">38048210</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kotanko</surname><given-names>P</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name></person-group><article-title>Artificial intelligence and machine learning in dialysis: ready for prime time?</article-title><source>Clin J Am Soc Nephrol</source><year>2023</year><month>06</month><day>1</day><volume>18</volume><issue>6</issue><fpage>803</fpage><lpage>805</lpage><pub-id pub-id-type="doi">10.2215/CJN.0000000000000089</pub-id><pub-id pub-id-type="medline">36795031</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Xie</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Medical foundation large language models for comprehensive text analysis and beyond</article-title><source>NPJ Digit Med</source><year>2025</year><month>03</month><day>5</day><volume>8</volume><issue>1</issue><fpage>141</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01533-1</pub-id><pub-id pub-id-type="medline">40044845</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Leas</surname><given-names>EC</given-names> </name><name name-style="western"><surname>Ayers</surname><given-names>JW</given-names> </name><name name-style="western"><surname>Desai</surname><given-names>N</given-names> </name><name name-style="western"><surname>Dredze</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hogarth</surname><given-names>M</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>DM</given-names> </name></person-group><article-title>Using large language models to support content analysis: A case study of ChatGPT for adverse event detection</article-title><source>J Med Internet Res</source><year>2024</year><month>05</month><day>2</day><volume>26</volume><fpage>e52499</fpage><pub-id pub-id-type="doi">10.2196/52499</pub-id><pub-id pub-id-type="medline">38696245</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kernberg</surname><given-names>A</given-names> </name><name 
name-style="western"><surname>Gold</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Mohan</surname><given-names>V</given-names> </name></person-group><article-title>Using ChatGPT-4 to create structured medical notes from audio recordings of physician-patient encounters: comparative study</article-title><source>J Med Internet Res</source><year>2024</year><month>04</month><day>22</day><volume>26</volume><fpage>e54419</fpage><pub-id pub-id-type="doi">10.2196/54419</pub-id><pub-id pub-id-type="medline">38648636</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Transcript of large language model responses.</p><media xlink:href="jmir_v27i1e73603_app1.docx" xlink:title="DOCX File, 51 KB"/></supplementary-material></app-group></back></article>