<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v26i1e52113</article-id>
      <article-id pub-id-type="pmid">38261378</article-id>
      <article-id pub-id-type="doi">10.2196/52113</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Assessing ChatGPT’s Mastery of Bloom’s Taxonomy Using Psychosomatic Medicine Exam Questions: Mixed-Methods Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>de Azevedo Cardoso</surname>
            <given-names>Taiane</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Li</surname>
            <given-names>XiaoYang</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Gencer</surname>
            <given-names>Adem</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Herrmann-Werner</surname>
            <given-names>Anne</given-names>
          </name>
          <degrees>MME, Prof Dr Med</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2413-7047</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Festl-Wietek</surname>
            <given-names>Teresa</given-names>
          </name>
          <degrees>MSc, Dr Rer Nat</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Tübingen Institute for Medical Education</institution>
            <institution>Faculty of Medicine</institution>
            <institution>University of Tübingen</institution>
            <addr-line>Elfriede-Aulhorn-Strasse 10</addr-line>
            <addr-line>Tübingen, 72076 Tübingen</addr-line>
            <country>Germany</country>
            <phone>49 7071 29 73715</phone>
            <email>teresa.festl-wietek@med.uni-tuebingen.de</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1450-1757</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Holderried</surname>
            <given-names>Friederike</given-names>
          </name>
          <degrees>MME, Dr Med</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1828-0920</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Herschbach</surname>
            <given-names>Lea</given-names>
          </name>
          <degrees>MSc</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0005-6378-5073</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Griewatz</surname>
            <given-names>Jan</given-names>
          </name>
          <degrees>MA</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9731-3171</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author">
          <name name-style="western">
            <surname>Masters</surname>
            <given-names>Ken</given-names>
          </name>
          <degrees>Prof Dr</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3425-5020</ext-link>
        </contrib>
        <contrib id="contrib7" contrib-type="author">
          <name name-style="western">
            <surname>Zipfel</surname>
            <given-names>Stephan</given-names>
          </name>
          <degrees>Prof Dr Med</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1659-4440</ext-link>
        </contrib>
        <contrib id="contrib8" contrib-type="author">
          <name name-style="western">
            <surname>Mahling</surname>
            <given-names>Moritz</given-names>
          </name>
          <degrees>Dr Med, MHBA</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-7960-4015</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Tübingen Institute for Medical Education</institution>
        <institution>Faculty of Medicine</institution>
        <institution>University of Tübingen</institution>
        <addr-line>Tübingen</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Psychosomatic Medicine and Psychotherapy</institution>
        <institution>University Hospital Tübingen</institution>
        <addr-line>Tübingen</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>University Department of Anesthesiology and Intensive Care Medicine</institution>
        <institution>University Hospital Tübingen</institution>
        <addr-line>Tübingen</addr-line>
        <country>Germany</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Medical Education and Informatics Department</institution>
        <institution>College of Medicine and Health Sciences</institution>
        <institution>Sultan Qaboos University</institution>
        <addr-line>Muscat</addr-line>
        <country>Oman</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Diabetology, Endocrinology, Nephrology</institution>
        <institution>Section of Nephrology and Hypertension</institution>
        <institution>University Hospital Tübingen</institution>
        <addr-line>Tübingen</addr-line>
        <country>Germany</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Teresa Festl-Wietek <email>teresa.festl-wietek@med.uni-tuebingen.de</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>23</day>
        <month>1</month>
        <year>2024</year>
      </pub-date>
      <volume>26</volume>
      <elocation-id>e52113</elocation-id>
      <history>
        <date date-type="received">
          <day>23</day>
          <month>8</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>26</day>
          <month>8</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>15</day>
          <month>9</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>7</day>
          <month>12</month>
          <year>2023</year>
        </date>
      </history>
      <copyright-statement>©Anne Herrmann-Werner, Teresa Festl-Wietek, Friederike Holderried, Lea Herschbach, Jan Griewatz, Ken Masters, Stephan Zipfel, Moritz Mahling. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 23.01.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2024/1/e52113" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Large language models such as GPT-4 (Generative Pre-trained Transformer 4) are being increasingly used in medicine and medical education. However, these models are prone to “hallucinations” (ie, outputs that seem convincing while being factually incorrect). It is currently unknown how these errors by large language models relate to the different cognitive levels defined in Bloom’s taxonomy.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to explore how GPT-4 performs in terms of Bloom’s taxonomy using psychosomatic medicine exam questions.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We used a large data set of psychosomatic medicine multiple-choice questions (N=307) with real-world results derived from medical school exams. GPT-4 answered the multiple-choice questions using 2 distinct prompt versions: detailed and short. The answers were analyzed using a quantitative approach and a qualitative approach. Focusing on incorrectly answered questions, we categorized reasoning errors according to the hierarchical framework of Bloom’s taxonomy.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>GPT-4’s performance in answering exam questions yielded a high success rate: 93% (284/307) for the detailed prompt and 91% (278/307) for the short prompt. Questions answered correctly by GPT-4 had a statistically significantly higher difficulty index (ie, were easier for students) than questions answered incorrectly (<italic>P</italic>=.002 for the detailed prompt and <italic>P</italic>&#60;.001 for the short prompt). Independent of the prompt, GPT-4’s lowest exam performance was 78.9% (15/19), thereby always surpassing the “pass” threshold. Our qualitative analysis of incorrect answers, based on Bloom’s taxonomy, showed that errors were primarily in the “remember” (29/68) and “understand” (23/68) cognitive levels; specific issues arose in recalling details, understanding conceptual relationships, and adhering to standardized guidelines.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>GPT-4 demonstrated a remarkable success rate when confronted with psychosomatic medicine multiple-choice exam questions, aligning with previous findings. When evaluated through Bloom’s taxonomy, our data revealed that GPT-4 occasionally ignored specific facts (remember), provided illogical reasoning (understand), or failed to apply concepts to a new situation (apply). These errors, which were confidently presented, could be attributed to inherent model biases and the tendency to generate outputs that maximize likelihood.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>answer</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>assessment</kwd>
        <kwd>Bloom’s taxonomy</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>classification</kwd>
        <kwd>error</kwd>
        <kwd>exam</kwd>
        <kwd>examination</kwd>
        <kwd>generative</kwd>
        <kwd>GPT-4</kwd>
        <kwd>Generative Pre-trained Transformer 4</kwd>
        <kwd>language model</kwd>
        <kwd>learning outcome</kwd>
        <kwd>LLM</kwd>
        <kwd>MCQ</kwd>
        <kwd>medical education</kwd>
        <kwd>medical exam</kwd>
        <kwd>multiple-choice question</kwd>
        <kwd>natural language processing</kwd>
        <kwd>NLP</kwd>
        <kwd>psychosomatic</kwd>
        <kwd>question</kwd>
        <kwd>response</kwd>
        <kwd>taxonomy</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>The recent developments in artificial intelligence (AI) have transformative potential for various fields, including medicine [<xref ref-type="bibr" rid="ref1">1</xref>] and medical education [<xref ref-type="bibr" rid="ref2">2</xref>]. In November 2022, OpenAI launched GPT-3 (Generative Pre-trained Transformer 3), a large language model (LLM) [<xref ref-type="bibr" rid="ref3">3</xref>]. Its high-quality performance surprised even experts and generated huge public interest (particularly in school and higher education settings, where GPT-3 prompted manifold discussions on its potential benefits and harms) [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>].</p>
      <p>In medical education, LLMs have the potential to revolutionize current teaching approaches and thus ultimately improve physician performance and health care outcomes. However, before LLMs are thoroughly integrated into medical education, their performance in this context must be comprehensively assessed. It is especially important to evaluate the capabilities of AI and LLMs within educational theoretical frameworks.</p>
      <p>One of the most-used frameworks in medical education is Bloom’s taxonomy [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>] of learning outcomes, first introduced in 1956. Briefly, Bloom and subsequent colleagues developed a hierarchical classification of cognitive processes, ordered from lower-order cognitive skills—such as knowledge recall (remember) and comprehension (understand)—to higher-order thinking—such as application (apply), analysis (analyze), evaluation (evaluate), and creation (create) [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
      <p>Since its first publication in 1956, this taxonomy has been used as a common language for educational instructors and still influences the field of medical education [<xref ref-type="bibr" rid="ref8">8</xref>]. With his work, Bloom provided a significant contribution to what is now known as outcome-based education [<xref ref-type="bibr" rid="ref9">9</xref>] and laid the foundation for other educational theories, such as Miller’s pyramid of clinical competencies [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref11">11</xref>]. While Bloom’s taxonomy is widely used and offers a structured approach to learning outcomes, some educators believe that its hierarchical nature might not always represent the complexity of learning [<xref ref-type="bibr" rid="ref12">12</xref>].</p>
      <p>Although derived from human learning processes, Bloom’s taxonomy provides an ideal framework to describe the cognitive processes that underlie success and failure. Recently, LLMs have been assessed for their (surprisingly mostly good) performance in various fields of medicine, ranging from specific subjects to board exams [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>]. However, the errors made by LLMs have not been evaluated in detail. For example, while LLMs might successfully recall facts (remember), they might struggle to apply those facts to a different context, or vice versa. We acknowledge that terms such as “remember” and “struggle” are anthropomorphisms used for ease of reading, as an LLM currently does neither and merely generates responses based on language-usage statistical probabilities using a “next-word prediction paradigm” [<xref ref-type="bibr" rid="ref16">16</xref>].</p>
      <p>Therefore, we aimed to use Bloom’s taxonomy to gain a better understanding of the failures of LLMs. For human medicine education and the aforementioned use cases for LLMs, multiple-choice questions (MCQs) remain a primary written exam form and are used for summative and formative assessments [<xref ref-type="bibr" rid="ref17">17</xref>]. In Bloom’s taxonomy, MCQs are often used to assess lower-order cognitive skills, such as knowledge recall (remember) and comprehension (understand), but they may also probe higher-order thinking, such as application (apply), analysis (analyze), and evaluation (evaluate) [<xref ref-type="bibr" rid="ref18">18</xref>]. Thus, MCQs offer a suitable lens for evaluating different cognitive processes.</p>
      <p>A medical field that relies heavily on language and factual understanding is instrumental to elucidating cognitive processes and correct or incorrect reasoning. Given its interplay of psychological, social, and biological factors, psychosomatic medicine offers such a case. The field’s heavy reliance on verbal and written communication for diagnosis and treatment makes it particularly challenging. Additionally, the combination of strict diagnostic criteria with a nuanced understanding of the patient’s language makes it an ideal testing ground for the capabilities of language models.</p>
      <p>We present a mixed methods study designed to explore how GPT-4 performs in terms of Bloom’s taxonomy. First, we assessed the performance of GPT-4 with a large set of psychosomatic medicine exam questions and compared the results to responses from a cohort of medical students, thereby providing human comparison and quality indicators. For a deeper understanding of the results, we used qualitative methods to comprehend the model’s performance and to assess the strengths and weaknesses of LLMs in relation to Bloom’s taxonomy. The findings of this study provide critical insights into the practical applications and limitations of LLMs (such as GPT-4) in medical education.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Exams</title>
        <p>A total of 16 examinations from winter term 2014-2015 to summer term 2022 were retrieved from the integrated management system of the Department of Psychosomatic Medicine and Psychotherapy faculty’s web-based exam program (<xref rid="figure1" ref-type="fig">Figure 1</xref> provides a graphical illustration of our methodological approach). In addition to question stems, answers, and distractors, the system also offers quality criteria for each individual question.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Illustration of our methodological approach. API: application programming interface; GPT-4: Generative Pre-trained Transformer 4; MCQ: multiple-choice question.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e52113_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Each examination consists of 20 MCQs, with each MCQ having 1 answer and 4 distractors. Diagnostic and therapeutic questions cover topics concerning anxiety disorders, depression, eating disorders, and somatoform and trauma disorders. The examinations also include questions concerning motivational interviewing techniques.</p>
        <p>To compare the performance of GPT-4 to the students’ performance, we retrieved the level of difficulty from the system, calculated as the average score attained on the respective question. Undergraduate medical students take the examination containing these questions at the end of their third year, after having attended 7.5 hours of lectures and 18 hours of practical classes on psychosomatic medicine and psychotherapy. To pass, a student is usually required to answer 12 of 20 questions correctly; an adjustment of this passing score is possible if, for example, a question performs too poorly.</p>
        <p>We assessed the questions for their suitability to be analyzed by GPT-4. From a total of 320 questions, 13 (4%) were excluded, including questions that were not single-choice answers (n=6), ambiguous questions (n=3), questions featuring a graphic that had to be analyzed (n=2), and questions that covered a case represented in multiple questions (n=2).</p>
        <p>We used GPT-4 to answer every question (model “gpt-4,” OpenAI LP). For each question, we generated a detailed prompt version and a short prompt version. The prompts were created by the authors in an iterative process using the web interface ChatGPT Plus to achieve ideal performance. The most relevant difference between the versions was that the detailed prompt included a command to critically reflect on the answer and justify the choice made. <xref ref-type="table" rid="table1">Table 1</xref> provides examples of the detailed and short prompts. We used the application programming interface (API) provided by OpenAI to post the questions to GPT-4 and retrieve the answers (access dates were March 21 and March 22, 2023). Each question and answer was posted in its own session. All interactions with GPT-4 were conducted in German, the original language of the examination questions; for the purposes of this paper, the questions were manually translated into English.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Examples of detailed and short prompt versions.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="800"/>
            <thead>
              <tr valign="top">
                <td>Prompt version</td>
                <td>Example</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Detailed prompt</td>
                <td>You have all the knowledge of psychosomatic medicine and have to answer an exam question. Please elaborate on the following multiple-choice question. Only one of the five answer choices is correct. Please consider carefully and choose one answer. Give a detailed reason for your answer. At the end of the reasoning, please add the letter of the answer you chose with the following notation: !A! (that is, A between two exclamation marks if you chose answer A). The question reads: A 49-year-old teacher is undergoing rehabilitation treatment after anterior wall infarction. It has been observed that he skips therapy appointments. The wife is concerned because he shows no interest in her visits. When asked, he states that he sees no point in further treatment because he has little hope that his condition will improve. However, none of that really matters anymore. He is also not afraid of another heart attack; on the contrary, then his misery would come to an end. Which suspected diagnosis in the psychosomatic field are you most likely to make? Answer A: Unspecific somatisation disorder; Answer B: Post-traumatic stress disorder; Answer C: Pseudodementia; Answer D: Generalised anxiety disorder; Answer E: Depression.</td>
              </tr>
              <tr valign="top">
                <td>Short prompt</td>
                <td>Please act like a specialist in psychosomatic medicine. Answer the following multiple-choice question and briefly explain your answer: A 49-year-old teacher is undergoing rehabilitation treatment after anterior wall infarction. It has been observed that he skips therapy appointments. The wife is concerned because he shows no interest in her visits. When asked, he states that he sees no point in further treatment because he has little hope that his condition will improve. However, none of that really matters anymore. He is also not afraid of another heart attack; on the contrary, then his misery would come to an end. Which suspected diagnosis in the psychosomatic field are you most likely to make? Answer A: Unspecific somatisation disorder; Answer B: Post-traumatic stress disorder; Answer C: Pseudodementia; Answer D: Generalised anxiety disorder; Answer E: Depression.</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
      </sec>
      <sec>
        <title>Data Analysis</title>
        <p>The responses given by GPT-4 were compared to the answers indicated by the answer index (eg, “A” or “C”) and stored in Excel (version 16.0.10394.20022; Microsoft Corporation).</p>
        <sec>
          <title>Quantitative Data Analysis</title>
          <p>Quantitative analyses and figure generation were performed using R (R version 4.3.1; R Core Team) statistical software [<xref ref-type="bibr" rid="ref19">19</xref>]. Briefly, we combined all tables with relevant data—that is, answers from GPT-4 and the aggregated data of the students’ exams (such as item difficulty). For each prompt version, we analyzed the ratio of correctly answered questions versus incorrectly answered questions. We further compared the question difficulty (taken from the aggregated student data) across questions answered correctly and incorrectly by GPT-4. The difficulty of a question is operationalized as the proportion of students answering a question correctly, with 0 representing a very difficult question and 1 a very easy question [<xref ref-type="bibr" rid="ref20">20</xref>]. A Wilcoxon rank sum test was used to test for statistical significance. A level of <italic>P</italic>&#60;.05 was considered statistically significant. If not stated otherwise, the results are given in medians and IQRs.</p>
        </sec>
        <sec>
          <title>Qualitative Data Analysis</title>
          <p>A total of 2 authors (TFW and FH) separately coded each text response. The answers from GPT-4 were analyzed inductively and iteratively according to Mayring’s [<xref ref-type="bibr" rid="ref21">21</xref>] qualitative content analysis, as described previously by our group [<xref ref-type="bibr" rid="ref22">22</xref>]. The goal of the analysis was defined in line with the answers to the examination questions. For the main category, we used the correct or incorrect answer to the question, then further focused primarily on incorrect answers.</p>
          <p>In the answer texts, individual reasoning was categorized according to Bloom’s taxonomy as revised by Krathwohl [<xref ref-type="bibr" rid="ref8">8</xref>]. Briefly, we used the following definitions of the cognitive domains for our rating procedure:</p>
          <list list-type="order">
            <list-item>
              <p>Remember: retrieving relevant knowledge from long-term memory.</p>
            </list-item>
            <list-item>
              <p>Understand: determining the meaning of instructional messages, including oral, written, and graphic communication.</p>
            </list-item>
            <list-item>
              <p>Apply: carrying out or using a procedure in a given situation.</p>
            </list-item>
            <list-item>
              <p>Analyze: breaking down material into its constituent parts and detecting how the parts relate to one another and to an overall structure or purpose.</p>
            </list-item>
            <list-item>
              <p>Evaluate: making judgments based on criteria and standards.</p>
            </list-item>
          </list>
          <p>In the second step, each of the raters coded the answers using MAXQDA (version 12.3.2; VERBI software). To obtain the same level of abstraction when building the categories, the raters revised the codes together and agreed on the final categories, paraphrasing representative examples and building a hierarchy of categories based on the found codes. Subsequently, both initial raters independently worked through the material again. Each rater individually analyzed the answers given by GPT-4 and built codes using MAXQDA, including the main classification (correct or wrong answer), followed by the category of Bloom’s taxonomy and an example. When they could not agree on a category, 2 other experts were consulted in order to reach a consensus. When GPT-4’s responses were wrong, the explanation was analyzed using the levels of Bloom’s taxonomy (remember, understand, apply, analyze, evaluate, and create) [<xref ref-type="bibr" rid="ref8">8</xref>].</p>
        </sec>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>The Ethics Committee of the Faculty of Medicine at University Hospital Tübingen approved the study (number 076/2023A). All data were kept anonymous and were not associated with individual students.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Quantitative Results</title>
        <sec>
          <title>Distribution of Correctly and Incorrectly Answered Questions</title>
          <p>For the detailed prompt, GPT-4 answered 92.5% (284/307) of the questions correctly; for the short prompt, the success rate was 90.6% (278/307). The distribution is shown in <xref rid="figure2" ref-type="fig">Figure 2</xref>.</p>
          <fig id="figure2" position="float">
            <label>Figure 2</label>
            <caption>
              <p>Distribution of correctly and incorrectly answered questions by prompt version.</p>
            </caption>
            <graphic xlink:href="jmir_v26i1e52113_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Question Difficulty</title>
          <p>Across all questions, the median difficulty was 0.892 (IQR 0.705-0.949); because difficulty is the proportion of students answering correctly, higher values indicate easier questions. The distribution of the question difficulty for correctly and incorrectly answered questions is displayed in <xref rid="figure3" ref-type="fig">Figure 3</xref>.</p>
          <p>For the detailed prompt, questions answered correctly had a higher difficulty (median 0.900, IQR 0.737-0.952) compared with questions answered incorrectly (median 0.705, IQR 0.380-0.885). This difference was statistically significant (<italic>P</italic>=.002).</p>
          <p>In the analysis of the short prompt, we also found a lower difficulty for incorrectly answered questions (median 0.708, IQR 0.500-0.864) compared with correctly answered questions (median 0.904, IQR 0.741-0.953). Here as well, a significant difference was detected between the correctly and incorrectly answered questions (<italic>P</italic>&#60;.001).</p>
          <fig id="figure3" position="float">
            <label>Figure 3</label>
            <caption>
              <p>Question difficulty by prompt version and answer correctness.</p>
            </caption>
            <graphic xlink:href="jmir_v26i1e52113_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
        <sec>
          <title>Exam Scores</title>
          <p>We further analyzed the performance of GPT-4 for all 16 individual exams (<xref rid="figure4" ref-type="fig">Figure 4</xref>). Regardless of the prompt version, GPT-4 never scored below 78.9% and thus always passed the exams. Furthermore, 3 exams (exam “WS17/18” for both prompt versions and exams “SS16” and “SS19” for the detailed prompt only) were passed with a score of 100%.</p>
          <fig id="figure4" position="float">
            <label>Figure 4</label>
            <caption>
              <p>Performance of GPT-4 for all relevant terms for the detailed prompt (red) and short prompt (blue). GPT-4: Generative Pre-trained Transformer 4.</p>
            </caption>
            <graphic xlink:href="jmir_v26i1e52113_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
          </fig>
        </sec>
      </sec>
      <sec>
        <title>Qualitative Results</title>
        <p>After the quantitative results were obtained, we analyzed the answers given by GPT-4 in detail using content analysis.</p>
        <sec>
          <title>Correctly Answered Questions</title>
          <p>When GPT-4 answered a question correctly, it provided either a detailed or short explanation (<xref ref-type="table" rid="table2">Table 2</xref>). In the detailed version, GPT-4 also commented on the distractors. For correctly answered questions, the content analysis revealed logical reasoning and the correct use of facts. With regard to the facts cited, no answers were found to be either implausible or incorrect.</p>
          <table-wrap position="float" id="table2">
            <label>Table 2</label>
            <caption>
              <p>Correct answers by GPT-4 (Generative Pre-trained Transformer 4) to the question described in <xref ref-type="table" rid="table1">Table 1</xref> for the detailed and short prompts.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="150"/>
              <col width="850"/>
              <thead>
                <tr valign="top">
                  <td>Prompt version</td>
                  <td>Correct answer</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Detailed prompt</td>
                  <td>Explanation: After careful consideration of the various answer options and taking into account all the knowledge of psychosomatic medicine, answer E: Depression is the most plausible.<break/>In this case, the 49-year-old teacher shows clear signs of emotional impairment after an anterior wall infarction. These include disinterest in therapy appointments, lack of interest in his wife’s visits, hopelessness about his recovery, and no sense of fear of another heart attack – on the contrary, he sees the prospect of an end to his “misery” as a positive thing. All these signs point to a possible depressive episode.<break/>Answer A: Nonspecific somatization disorder can be ruled out because the patient does not have a multitude of physical symptoms without a demonstrable organic cause.<break/>Answer B: Post-traumatic stress disorder could be considered due to the event of the infarction, but the typical symptoms such as memories or flashbacks of the traumatic event, avoidance behaviour, and increased irritability are missing in the description of the case.<break/>Answer C: Pseudodementia refers to cognitive impairment caused by depressive symptoms rather than actual dementia. Although this would be possible in the case of depression, the question does not provide enough information about cognitive impairment to support this diagnosis.<break/>Answer D: Generalized Anxiety Disorder does not fit the case as the patient states that he is not afraid of having another heart attack and overall does not show excessive fear or anxiety.<break/>Therefore, answer E: Depression is the most plausible tentative diagnosis in the psychosomatic field in this case.</td>
                </tr>
                <tr valign="top">
                  <td>Short prompt</td>
                  <td>Answer E: Depression<break/>The symptoms described, such as lack of interest in further treatment, lack of hope for improvement, dejection and even death wishes, most likely indicate depression. The patient shows signs of anhedonia (inability to feel pleasure or interest in activities) and negative thought patterns. In such cases, psychosomatic assessment and appropriate treatment is important to aid the patient’s recovery.</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
        </sec>
        <sec>
          <title>Incorrectly Answered Questions</title>
          <p>When a question was answered incorrectly by GPT-4, the explanation’s reasoning was analyzed using Bloom’s taxonomy, with its domains “remember,” “understand,” “apply,” “analyze,” “evaluate,” and “create” (<xref ref-type="table" rid="table3">Table 3</xref>). Most mistakes were related to “remember,” followed by “understand” and “apply,” and only 1 erroneous reasoning example was found for “evaluate.” In the following sections, examples for each category are given in detail.</p>
          <table-wrap position="float" id="table3">
            <label>Table 3</label>
            <caption>
              <p>Count of incorrect reasonings by GPT-4 (Generative Pre-trained Transformer 4) for each category of Bloom’s taxonomy. For this analysis, all answers (detailed prompts and short prompts) were analyzed together.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="500"/>
              <col width="500"/>
              <thead>
                <tr valign="top">
                  <td>Category of Bloom’s taxonomy</td>
                  <td>Count (n=68), n</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Remember</td>
                  <td>29</td>
                </tr>
                <tr valign="top">
                  <td>Understand</td>
                  <td>23</td>
                </tr>
                <tr valign="top">
                  <td>Apply</td>
                  <td>15</td>
                </tr>
                <tr valign="top">
                  <td>Analyze</td>
                  <td>0</td>
                </tr>
                <tr valign="top">
                  <td>Evaluate</td>
                  <td>1</td>
                </tr>
                <tr valign="top">
                  <td>Create</td>
                  <td>0</td>
                </tr>
              </tbody>
            </table>
          </table-wrap>
          <sec>
            <title>Bloom’s Taxonomy: Remember</title>
            <p>This section provides an example of a question for which GPT-4 failed to remember facts. The question presented the case of a man with recurrent physical complaints without organic cause; the complaints persisted for 10 months, making it appropriate for a diagnosis of “undifferentiated somatoform disorder” (F45.1 according to the <italic>International Classification of Diseases, 10th Revision</italic> [ICD-10]). A “somatization disorder” (F45.0) cannot be chosen, as this requires “symptoms of at least 2 years duration” [<xref ref-type="bibr" rid="ref23">23</xref>].</p>
            <p>However, GPT-4 incorrectly chose “somatization disorder.” In its reasoning, GPT-4 stated:</p>
            <disp-quote>
              <p>…I would choose answer C: Somatization disorder (F45.0) as the most appropriate diagnosis for this patient. Somatization disorder (F45.0) is characterized by persistent, multiple, and recurrent physical complaints for which no sufficient organic cause can be found. The symptoms can occur in different organ or functional systems and are usually severe enough to interfere with the patient’s daily life.</p>
            </disp-quote>
            <p>In its recall, GPT-4 did not mention the required duration of the symptoms stated in the ICD-10. Thus, from what can be observed in the answer, the concept used by GPT-4 missed at least 1 important fact, and this failure could be attributed to GPT-4 not “remembering.”</p>
          </sec>
          <sec>
            <title>Bloom’s Taxonomy: Understand</title>
            <p>An example of GPT-4 not understanding some basic concepts can be found in a question concerning anorexia nervosa. The answer relates to the fact that anorexia nervosa can be accompanied by transient hypercholesterolemia, which does not necessitate treatment [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref25">25</xref>]. However, GPT-4 picked one of the distractors, which states that the basal metabolic rate increases during phases of low body weight and decreases after refeeding [<xref ref-type="bibr" rid="ref26">26</xref>]—but the opposite is true. In its explanation, GPT-4 thought about these facts (and even remembered them correctly in sentence 3!) but did not seem to understand how they relate and interact.</p>
            <disp-quote>
              <p>In patients with anorexia nervosa, the basal metabolic rate is initially elevated at the onset of weight gain and decreases dramatically during refeeding. Food intake should be adjusted accordingly. Basal metabolic rate here refers to the amount of energy the body needs at rest to maintain its basic functions. In people with anorexia nervosa, the body is initially very efficient at obtaining energy due to persistent nutrient deficiency. However, as refeeding progresses, the basal metabolic rate drops dramatically as the body re-learns to store energy.</p>
            </disp-quote>
          </sec>
          <sec>
            <title>Bloom’s Taxonomy: Apply</title>
            <p>The third level of Bloom’s taxonomy involves the application of previously learned content to other areas. An error in this domain was evident when we asked GPT-4 a question about the diagnosis and treatment of a depressive episode. The correct answer was that symptoms or criteria must be present for at least 14 days, which is clearly required by the ICD-10 [<xref ref-type="bibr" rid="ref23">23</xref>]. However, GPT-4 rated this answer as incorrect and implied that this time frame is more of a rough guideline than a strict minimum requirement.</p>
            <disp-quote>
              <p>In the diagnosis of depressive episodes, the time criterion is an important factor. However, many experts believe that a period of at least two weeks is rather a rough guideline. Some patients can also go through a depressive episode in a shorter time or fulfil the criteria for a longer time.</p>
            </disp-quote>
            <p>This reveals an issue at the “application” level of Bloom’s taxonomy. GPT-4 seemed to “remember” the fact about the temporal criterion and “understood” its concept. However, it struggled with the “application” of this rule, proposing flexibility where there is conventional rigidity. This can be interpreted as a misapplication of the learned rule and, thus, a mistake made at the application level of Bloom’s taxonomy.</p>
          </sec>
          <sec>
            <title>Bloom’s Taxonomy: Evaluate</title>
            <p>GPT-4 made a mistake at the “evaluate” level of Bloom’s taxonomy in a more complex question testing one’s judgment regarding the ideal therapy for a patient. Due to the complexity, the question is presented here in full:</p>
            <disp-quote>
              <p>A 57-year-old ready-to-wear clothing-maker is referred to the psychosomatic outpatient clinic by her family doctor because of persistent back pain. She reports “terrible” pain in her lower back. She has been unable to work for seven months because of it and can hardly sleep. No one was able to help her; the family doctor just sent her from doctor to doctor. (The findings from orthopaedics, neurology, neurosurgery, and radiology that she brought with her show minor disc changes on several levels, but no neurological deficits). She does not want to undergo surgery because “they probably could not earn enough money from her as a patient with public insurance”. Now the family doctor has prescribed her an antidepressant, which she will not take. She still has hope for improvement, but if it continues like this, she does not know whether she will put an end to her life. What is the first course of action indicated?</p>
            </disp-quote>
            <disp-quote>
              <p>A: Immediate admission to a psychiatric ward, against the patient’s will if necessary.</p>
            </disp-quote>
            <disp-quote>
              <p>B: Initiate inpatient psychosomatic treatment.</p>
            </disp-quote>
            <disp-quote>
              <p>C: Observation of the symptoms under medication with NSAIDs.</p>
            </disp-quote>
            <disp-quote>
              <p>D: Obtain a neurosurgical second opinion after current imaging.</p>
            </disp-quote>
            <disp-quote>
              <p>E: Clarification of secondary disease gain and immediate termination of incapacity for work.</p>
            </disp-quote>
            <p>When we used the short prompt, GPT-4 chose answer B: Initiate inpatient psychosomatic treatment. In its brief explanation, GPT-4 correctly judged why this was the most likely answer.</p>
            <p>However, when we used the detailed prompt, GPT-4 chose the distractor E and sent the patient back to work effectively immediately, arguing:</p>
            <disp-quote>
              <p>After careful consideration, answer E is most appropriate, but with modification. A detailed explanation of psychosomatic correlations and the possible role of secondary illness gain might help the patient better understand her situation and is thus indicated. However, immediate cessation of incapacity should not be forced, as this could potentially exacerbate the patient’s pain and frustration. It would make sense to first seek outpatient psychosomatic treatment to help the patient cope with her pain and focus on long-term recovery.</p>
            </disp-quote>
            <p>In this example, it is evident that GPT-4 knows the facts (remember), understands them (understand), applies them to this situation (apply), and correctly analyzes the situation (analyze). GPT-4 can even be observed to evaluate, but it fails at evaluating the consequences of its decision. While GPT-4 seems to be aware of this and wants to modify the given answer, it incorrectly evaluates the answer options and does not recommend inpatient psychosomatic treatment.</p>
          </sec>
        </sec>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>Tools based on LLMs are rapidly evolving and influencing medicine [<xref ref-type="bibr" rid="ref1">1</xref>] and medical education [<xref ref-type="bibr" rid="ref2">2</xref>]. In this study, we show that GPT-4 yields high accuracy in answering psychosomatic medicine questions, with success rates well above 90%. This confirms results from other studies, which also show high exam scores in, for example, the USMLE (United States Medical Licensing Examination) [<xref ref-type="bibr" rid="ref14">14</xref>] and other board exams [<xref ref-type="bibr" rid="ref27">27</xref>]. Although we detected high correct-answer rates for both our detailed prompt and short prompt versions, GPT-4 answered more questions correctly when we used the detailed prompt (284/307, 92.5% vs 278/307, 90.6%). While this study was not designed to assess reasons for this difference, 2 likely explanations for it deserve attention. First, the difference in rates for the 2 prompt versions could represent a true difference in the LLM’s performance with respect to the prompt version [<xref ref-type="bibr" rid="ref28">28</xref>]. Second, the difference could be due to a random variation that is known to occur even when the same prompt is used more than once [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>].</p>
        <p>We were also interested in GPT-4’s performance compared with that of medical students. Here, our analysis revealed that the questions answered correctly by GPT-4 were significantly easier than the questions that were incorrectly answered. This difference could be observed for both the detailed prompt and short prompt versions. For further comparison, it should be noted that question difficulty is not a fixed or static variable but rather depends on, and is calculated from, the responses of human students [<xref ref-type="bibr" rid="ref20">20</xref>].</p>
        <p>However, in order to understand why GPT-4 fails at some questions, we further analyzed incorrectly answered questions using a qualitative approach. It is well known that incorrect or inaccurate information is an important issue with LLMs [<xref ref-type="bibr" rid="ref31">31</xref>]. Bloom’s taxonomy has emerged as a frequently used standard to describe the cognitive process underlying learning [<xref ref-type="bibr" rid="ref8">8</xref>]. To the best of our knowledge, the levels in Bloom’s taxonomy at which GPT-4 commits cognitive errors have not yet been elucidated. Thus, we performed a detailed assessment of the answers and reasoning provided by GPT-4.</p>
        <p>In our analysis, we found that most errors were made at the lowest level of Bloom’s taxonomy, labeled “remember.” In these answers, GPT-4 failed at naming or using a specific fact, as evident in the text response. In the example presented in the previous section, GPT-4 named most of the diagnostic criteria for a somatization disorder but did not mention the time criterion. In this context, it is important to note that GPT-4 has been trained with publicly available and licensed data (although these are not specified in detail by OpenAI) [<xref ref-type="bibr" rid="ref32">32</xref>]. The information needed in this example is publicly available in the ICD-10 [<xref ref-type="bibr" rid="ref23">23</xref>] and thus is expected to be included in the GPT-4 training data. Indeed, when asked by the authors, ChatGPT (using GPT-4) states that its training data include details on the ICD-10.</p>
        <p>In a recent study, Johnson et al [<xref ref-type="bibr" rid="ref33">33</xref>] evaluated ChatGPT for its accuracy in providing medical information. Using a quantitative approach, they found that GPT-3.5 provided medical answers ranging from “mostly correct” to “almost correct.” It is important to note that these results were generated using GPT-3.5, an older model than the model we used: GPT-4 is claimed to be “40% more likely to produce factual responses than GPT-3.5” [<xref ref-type="bibr" rid="ref32">32</xref>] and exhibits better performance in medical exams [<xref ref-type="bibr" rid="ref27">27</xref>]. However, the results reported by Johnson et al [<xref ref-type="bibr" rid="ref33">33</xref>] are in line with our findings: while GPT-4 uses most facts correctly and completely, it sometimes fails with respect to specific details. In psychosomatic medicine, we observed this to be a diagnostic criterion; while this can be an issue, missing some specific facts in other areas can make all the difference for patient outcomes. Thus, it is important for those using GPT-4 in medicine to keep in mind that specific facts can be wrong or missing.</p>
        <p>Some errors were found at the second level of Bloom’s taxonomy, labeled “understand.” While GPT-4 generally showed good reasoning capability [<xref ref-type="bibr" rid="ref32">32</xref>] and errors were sparse, we were surprised that some answers yielded obvious logical flaws, as seen in the example from the previous section. In its response, GPT-4 confidently presents a set of sentences that do not correlate logically. Mechanistically, language models such as GPT use likelihood maximization, generating text based on what most likely follows [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref34">34</xref>]. However, this approach can result in what are called hallucinations, or “content that is nonsensical or unfaithful to the provided source content” [<xref ref-type="bibr" rid="ref34">34</xref>]. The resulting medical information might sound very confident but be incorrect [<xref ref-type="bibr" rid="ref35">35</xref>], thereby posing a significant threat for medical applications [<xref ref-type="bibr" rid="ref5">5</xref>]. This raises ethical concerns around the use of AI systems for patient-related work, particularly as GPT-4’s algorithms and ethical models are unknown and variable [<xref ref-type="bibr" rid="ref4">4</xref>]. Because GPT-4 is not considered to be sentient, it neither knows nor cares about the accuracy of its responses.</p>
        <p>We also detected some mistakes that represented the Bloom’s taxonomy level labeled “apply.” In our representative example presented in the previous section, GPT-4 was quite flexible in applying a very strict time criterion. This can be interpreted in the context of the process of training LLMs. Although little has been published about this process, classification systems probably represent only a small amount of the data available on a certain subject. It can be further assumed that information designed for the public might not be as specific as strict diagnostic criteria because it serves another audience. Thus, following GPT-4’s likelihood maximization approach, LLMs might neglect a specific but likely underrepresented piece of information. Furthermore, GPT-4 has been observed to perform poorly in pure calculation tasks [<xref ref-type="bibr" rid="ref35">35</xref>], probably also challenging strictly numerical criteria. This is not entirely surprising, as GPT-4 is an LLM (with an emphasis on language) and is not intended to be used as a calculator.</p>
        <p>We found only 1 mistake that could be classified at the Bloom’s taxonomy level labeled “evaluate,” in which GPT-4 incorrectly judged a medically complex situation.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study has some limitations that merit discussion. First, we used Bloom’s taxonomy. While it provides good operationalization for cognitive processes, the taxonomy represents a continuum wherein more than 1 level can be activated in a single question [<xref ref-type="bibr" rid="ref8">8</xref>]. Nevertheless, we observed that most errors could be attributed to only 1 cognitive level. Second, we chose questions from psychosomatic medicine because many aspects of this field can be covered by written language and do not require images or many numbers. However, psychosomatic medicine is a specialty in which treatment can be individualized and especially complex, making categorical judgment harder and possibly reducing generalizability beyond this field. Third, since we only used GPT-4 as our LLM, we cannot judge if our implications hold true for other models. Fourth, our data were acquired 1 time, at a specific date. As the performance of GPT-4 varies over time, this could reduce generalizability [<xref ref-type="bibr" rid="ref36">36</xref>]. Finally, in this study, we compared GPT-4’s performance with that of medical students by using the difficulty index of each exam question. It is important to note that this difficulty index is specific to the particular cohort of medical students who participated in the respective exam.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>In summary, we found that GPT-4 performs extremely well on psychosomatic medicine questions. Questions answered correctly by GPT-4 were also easier for human students than questions answered incorrectly, as shown by the level of question difficulty. When analyzing the mistakes of GPT-4, we found that most errors corresponded to lower-order cognitive levels, particularly “remember” and “understand.” While we found some mistakes for “apply,” very few or no errors were found for “analyze” and “evaluate” (“create” could not be assessed in this study). To the best of our knowledge, this study is the first to describe the cognitive levels at which GPT-4 makes mistakes in the context of psychosomatic medicine.</p>
        <p>This study has important implications. First, GPT-4 is already capable of answering many questions in (psychosomatic) medicine, and thus, should the technology be made available, it could reduce the effectiveness of summative assessment. Second, GPT-4 sometimes fails at exact facts, correct understanding, and application of knowledge; however, without exact knowledge, these failures are hard to recognize. Thus, the output generated by GPT-4 must be checked for accuracy, especially in those domains. Our research can also help in model training, and future studies can use our results to correlate model training and LLM outcomes.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group/>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">GPT</term>
          <def>
            <p>Generative Pre-trained Transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">ICD-10</term>
          <def>
            <p>International Classification of Diseases, 10th Revision</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">MCQ</term>
          <def>
            <p>multiple-choice question</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">USMLE</term>
          <def>
            <p>United States Medical Licensing Examination</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors would like to acknowledge the support of the Open Access Publishing Fund of the University of Tübingen.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets used and/or analyzed during this study are available from the corresponding author upon reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>AHW and MM were responsible for designing and conducting the study, as well as the acquisition, analysis, and interpretation of data. AHW and MM also drafted the first version of the manuscript. TFW, FH, and LH were involved in data analyses and interpretation. JG, KM, and SZ made substantial contributions to the study design. All authors critically revised the manuscript, and all authors approved the final version of the manuscript and agreed to be accountable for all aspects of the work.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reddy</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Explainability and artificial intelligence in medicine</article-title>
          <source>Lancet Digit Health</source>
          <year>2022</year>
          <volume>4</volume>
          <issue>4</issue>
          <fpage>e214</fpage>
          <lpage>e215</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2589-7500(22)00029-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/S2589-7500(22)00029-2</pub-id>
          <pub-id pub-id-type="medline">35337639</pub-id>
          <pub-id pub-id-type="pii">S2589-7500(22)00029-2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eysenbach</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>The role of ChatGPT, generative language models, and artificial intelligence in medical education: a conversation with ChatGPT and a call for papers</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e46885</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e46885/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/46885</pub-id>
          <pub-id pub-id-type="medline">36863937</pub-id>
          <pub-id pub-id-type="pii">v9i1e46885</pub-id>
          <pub-id pub-id-type="pmcid">PMC10028514</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <article-title>ChatGPT: optimizing language models for dialogue</article-title>
          <source>OpenAI</source>
          <year>2022</year>
          <access-date>2023-08-07</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://web.archive.org/web/20221130180912/https://openai.com/blog/chatgpt/">https://web.archive.org/web/20221130180912/https://openai.com/blog/chatgpt/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Masters</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Ethical use of artificial intelligence in health professions education: AMEE guide no. 158</article-title>
          <source>Med Teach</source>
          <year>2023</year>
          <volume>45</volume>
          <issue>6</issue>
          <fpage>574</fpage>
          <lpage>584</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.tandfonline.com/doi/full/10.1080/0142159X.2023.2186203"/>
          </comment>
          <pub-id pub-id-type="doi">10.1080/0142159X.2023.2186203</pub-id>
          <pub-id pub-id-type="medline">36912253</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Bubeck</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Petro</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Benefits, limits, and risks of GPT-4 as an AI chatbot for medicine</article-title>
          <source>N Engl J Med</source>
          <year>2023</year>
          <volume>388</volume>
          <issue>13</issue>
          <fpage>1233</fpage>
          <lpage>1239</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.nejm.org/doi/10.1056/NEJMsr2214184"/>
          </comment>
          <pub-id pub-id-type="doi">10.1056/NEJMsr2214184</pub-id>
          <pub-id pub-id-type="medline">36988602</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bloom</surname>
              <given-names>BS</given-names>
            </name>
            <name name-style="western">
              <surname>Engelhart</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Furst</surname>
              <given-names>EJ</given-names>
            </name>
            <name name-style="western">
              <surname>Hill</surname>
              <given-names>WH</given-names>
            </name>
            <name name-style="western">
              <surname>Krathwohl</surname>
              <given-names>DR</given-names>
            </name>
          </person-group>
          <source>Taxonomy of Educational Objectives: The Classification of Educational Goals: Handbook I: Cognitive Domain</source>
          <year>1956</year>
          <publisher-loc>London</publisher-loc>
          <publisher-name>Longman Green &#38; Co</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Anderson</surname>
              <given-names>LW</given-names>
            </name>
            <name name-style="western">
              <surname>Krathwohl</surname>
              <given-names>DR</given-names>
            </name>
          </person-group>
          <source>A Taxonomy for Learning, Teaching, and Assessing: A Revision of Bloom's Taxonomy of Educational Objectives: Complete Edition</source>
          <year>2001</year>
          <publisher-loc>New York</publisher-loc>
          <publisher-name>Longman</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Krathwohl</surname>
              <given-names>DR</given-names>
            </name>
          </person-group>
          <article-title>A revision of Bloom's Taxonomy: an overview</article-title>
          <source>Theory Into Practice</source>
          <year>2002</year>
          <volume>41</volume>
          <issue>4</issue>
          <fpage>212</fpage>
          <lpage>218</lpage>
          <pub-id pub-id-type="doi">10.1207/s15430421tip4104_2</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Harden</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>Outcome-based education: the future is today</article-title>
          <source>Med Teach</source>
          <year>2007</year>
          <volume>29</volume>
          <issue>7</issue>
          <fpage>625</fpage>
          <lpage>629</lpage>
          <pub-id pub-id-type="doi">10.1080/01421590701729930</pub-id>
          <pub-id pub-id-type="medline">18236247</pub-id>
          <pub-id pub-id-type="pii">787703923</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>GE</given-names>
            </name>
          </person-group>
          <article-title>The assessment of clinical skills/competence/performance</article-title>
          <source>Acad Med</source>
          <year>1990</year>
          <volume>65</volume>
          <issue>9 Suppl</issue>
          <fpage>S63</fpage>
          <lpage>S67</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://journals.lww.com/academicmedicine/abstract/1990/09000/the_assessment_of_clinical.45.aspx"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/00001888-199009000-00045</pub-id>
          <pub-id pub-id-type="medline">2400509</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>DCM</given-names>
            </name>
            <name name-style="western">
              <surname>Hamdy</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Adult learning theories: implications for learning and teaching in medical education: AMEE guide no. 83</article-title>
          <source>Med Teach</source>
          <year>2013</year>
          <volume>35</volume>
          <issue>11</issue>
          <fpage>e1561</fpage>
          <lpage>e1572</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.tandfonline.com/doi/full/10.3109/0142159X.2013.828153"/>
          </comment>
          <pub-id pub-id-type="doi">10.3109/0142159X.2013.828153</pub-id>
          <pub-id pub-id-type="medline">24004029</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Horne</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Learning hierarchies: a critique</article-title>
          <source>Educ Psychol</source>
          <year>1983</year>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>63</fpage>
          <lpage>77</lpage>
          <pub-id pub-id-type="doi">10.1080/0144341830030106</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gilson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Safranek</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Socrates</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Taylor</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Chartash</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>How does ChatGPT perform on the United States Medical Licensing Examination? the implications of large language models for medical education and knowledge assessment</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e45312</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e45312/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/45312</pub-id>
          <pub-id pub-id-type="medline">36753318</pub-id>
          <pub-id pub-id-type="pii">v9i1e45312</pub-id>
          <pub-id pub-id-type="pmcid">PMC9947764</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kung</surname>
              <given-names>TH</given-names>
            </name>
            <name name-style="western">
              <surname>Cheatham</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Medenilla</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sillos</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>de Leon</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Elepaño</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Madriaga</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aggabao</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Diaz-Candido</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Maningo</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tseng</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title>
          <source>PLOS Digit Health</source>
          <year>2023</year>
          <volume>2</volume>
          <issue>2</issue>
          <fpage>e0000198</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36812645"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id>
          <pub-id pub-id-type="medline">36812645</pub-id>
          <pub-id pub-id-type="pii">PDIG-D-22-00371</pub-id>
          <pub-id pub-id-type="pmcid">PMC9931230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Gong</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT performs on the Chinese National Medical Licensing Examination</article-title>
          <source>J Med Syst</source>
          <year>2023</year>
          <volume>47</volume>
          <issue>1</issue>
          <fpage>86</fpage>
          <pub-id pub-id-type="doi">10.1007/s10916-023-01961-0</pub-id>
          <pub-id pub-id-type="medline">37581690</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10916-023-01961-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bubeck</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chandrasekaran</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Eldan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gehrke</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Horvitz</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Kamar</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>YT</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Lundberg</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Nori</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Palangi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Ribeiro</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Sparks of artificial general intelligence: early experiments with GPT-4</article-title>
          <source>arXiv. Preprint posted online on March 22, 2023</source>
          <year>2023</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2303.12712"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2303.12712</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Epstein</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>Assessment in medical education</article-title>
          <source>N Engl J Med</source>
          <year>2007</year>
          <volume>356</volume>
          <issue>4</issue>
          <fpage>387</fpage>
          <lpage>396</lpage>
          <pub-id pub-id-type="doi">10.1056/NEJMra054784</pub-id>
          <pub-id pub-id-type="medline">17251535</pub-id>
          <pub-id pub-id-type="pii">356/4/387</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Case</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Swanson</surname>
              <given-names>DB</given-names>
            </name>
          </person-group>
          <source>Constructing Written Test Questions For the Basic and Clinical Sciences, Third Edition</source>
          <year>2002</year>
          <publisher-loc>Philadelphia, PA</publisher-loc>
          <publisher-name>National Board of Medical Examiners</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="web">
          <article-title>R: a language and environment for statistical computing</article-title>
          <source>The R Project for Statistical Computing</source>
          <access-date>2024-01-02</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.R-project.org/">https://www.R-project.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Möltner</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schellberg</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Jünger</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Grundlegende quantitative Analysen medizinischer Prüfungen. Article in German</article-title>
          <source>GMS Z Med Ausbild</source>
          <year>2006</year>
          <volume>23</volume>
          <issue>3</issue>
          <fpage>23</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.medizinische-fakultaet-hd.uni-heidelberg.de/fileadmin/kompzent/Moeltner_Quantitative_Analysen.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mayring</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Mey</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Mruck</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Qualitative Inhaltsanalyse</article-title>
          <source>Handbuch qualitative Forschung in der Psychologie</source>
          <year>2010</year>
          <publisher-loc>Wiesbaden</publisher-loc>
          <publisher-name>VS Verlag für Sozialwissenschaften</publisher-name>
          <fpage>601</fpage>
          <lpage>613</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Moldt</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Festl-Wietek</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Madany-Mamlouk</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nieselt</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Fuhl</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Herrmann-Werner</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Chatbots for future docs: exploring medical students' attitudes and knowledge towards artificial intelligence and medical chatbots</article-title>
          <source>Med Educ Online</source>
          <year>2023</year>
          <volume>28</volume>
          <issue>1</issue>
          <fpage>2182659</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36855245"/>
          </comment>
          <pub-id pub-id-type="doi">10.1080/10872981.2023.2182659</pub-id>
          <pub-id pub-id-type="medline">36855245</pub-id>
          <pub-id pub-id-type="pmcid">PMC9979998</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="web">
          <article-title>ICD-10: international statistical classification of diseases and related health problems: tenth revision, 2nd ed</article-title>
          <source>World Health Organization</source>
          <year>2004</year>
          <access-date>2023-08-05</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://apps.who.int/iris/handle/10665/42980">https://apps.who.int/iris/handle/10665/42980</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ohwada</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Hotta</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Oikawa</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Takano</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Etiology of hypercholesterolemia in patients with anorexia nervosa</article-title>
          <source>Int J Eat Disord</source>
          <year>2006</year>
          <volume>39</volume>
          <issue>7</issue>
          <fpage>598</fpage>
          <lpage>601</lpage>
          <pub-id pub-id-type="doi">10.1002/eat.20298</pub-id>
          <pub-id pub-id-type="medline">16791856</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weinbrenner</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Züger</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Jacoby</surname>
              <given-names>GE</given-names>
            </name>
            <name name-style="western">
              <surname>Herpertz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liedtke</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sudhop</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Gouni-Berthold</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Axelson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Berthold</surname>
              <given-names>HK</given-names>
            </name>
          </person-group>
          <article-title>Lipoprotein metabolism in patients with anorexia nervosa: a case-control study investigating the mechanisms leading to hypercholesterolaemia</article-title>
          <source>Br J Nutr</source>
          <year>2004</year>
          <volume>91</volume>
          <issue>6</issue>
          <fpage>959</fpage>
          <lpage>969</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cambridge.org/core/journals/british-journal-of-nutrition/article/lipoprotein-metabolism-in-patients-with-anorexia-nervosa-a-casecontrol-study-investigating-the-mechanisms-leading-to-hypercholesterolaemia/A6A83B3BB862D5EDF5D89C53CD672490"/>
          </comment>
          <pub-id pub-id-type="doi">10.1079/BJN20041151</pub-id>
          <pub-id pub-id-type="medline">15182399</pub-id>
          <pub-id pub-id-type="pii">S0007114504001175</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kosmiski</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Schmiege</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Mascolo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gaudiani</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mehler</surname>
              <given-names>PS</given-names>
            </name>
          </person-group>
          <article-title>Chronic starvation secondary to anorexia nervosa is associated with an adaptive suppression of resting energy expenditure</article-title>
          <source>J Clin Endocrinol Metab</source>
          <year>2014</year>
          <volume>99</volume>
          <issue>3</issue>
          <fpage>908</fpage>
          <lpage>914</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/24302748"/>
          </comment>
          <pub-id pub-id-type="doi">10.1210/jc.2013-1694</pub-id>
          <pub-id pub-id-type="medline">24302748</pub-id>
          <pub-id pub-id-type="pii">jc.2013-1694</pub-id>
          <pub-id pub-id-type="pmcid">PMC3942230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Takagi</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Watari</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Erabi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sakaguchi</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Performance of GPT-3.5 and GPT-4 on the Japanese Medical Licensing Examination: comparison study</article-title>
          <source>JMIR Med Educ</source>
          <year>2023</year>
          <volume>9</volume>
          <fpage>e48002</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://mededu.jmir.org/2023/1/e48002/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48002</pub-id>
          <pub-id pub-id-type="medline">37384388</pub-id>
          <pub-id pub-id-type="pii">v9i1e48002</pub-id>
          <pub-id pub-id-type="pmcid">PMC10365615</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lester</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Al-Rfou</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Constant</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>The power of scale for parameter-efficient prompt tuning</article-title>
          <year>2021</year>
          <conf-name>Proceedings of the 2021 Conference on Empirical Methods in Natural Language Processing</conf-name>
          <conf-date>November 7–11, 2021</conf-date>
          <conf-loc>Online and Punta Cana, Dominican Republic</conf-loc>
          <publisher-name>Association for Computational Linguistics</publisher-name>
          <fpage>3045</fpage>
          <lpage>3059</lpage>
          <pub-id pub-id-type="doi">10.18653/v1/2021.emnlp-main.243</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lyu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zapadka</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Ponnatapura</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Niu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Myers</surname>
              <given-names>KJ</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Whitlow</surname>
              <given-names>CT</given-names>
            </name>
          </person-group>
          <article-title>Translating radiology reports into plain language using ChatGPT and GPT-4 with prompt learning: results, limitations, and potential</article-title>
          <source>Vis Comput Ind Biomed Art</source>
          <year>2023</year>
          <volume>6</volume>
          <issue>1</issue>
          <fpage>9</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37198498"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s42492-023-00136-5</pub-id>
          <pub-id pub-id-type="medline">37198498</pub-id>
          <pub-id pub-id-type="pii">10.1186/s42492-023-00136-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC10192466</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dale</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>GPT-3: what’s it good for?</article-title>
          <source>Nat Lang Eng</source>
          <year>2020</year>
          <volume>27</volume>
          <issue>1</issue>
          <fpage>113</fpage>
          <lpage>118</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cambridge.org/core/journals/natural-language-engineering/article/gpt3-whats-it-good-for/0E05CFE68A7AC8BF794C8ECBE28AA990"/>
          </comment>
          <pub-id pub-id-type="doi">10.1017/S1351324920000601</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sallam</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT utility in healthcare education, research, and practice: systematic review on the promising perspectives and valid concerns</article-title>
          <source>Healthcare (Basel)</source>
          <year>2023</year>
          <volume>11</volume>
          <issue>6</issue>
          <fpage>887</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=healthcare11060887"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/healthcare11060887</pub-id>
          <pub-id pub-id-type="medline">36981544</pub-id>
          <pub-id pub-id-type="pii">healthcare11060887</pub-id>
          <pub-id pub-id-type="pmcid">PMC10048148</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="web">
          <article-title>GPT-4</article-title>
          <source>OpenAI</source>
          <access-date>2023-08-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/gpt-4">https://openai.com/gpt-4</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Goodman</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Patrinely</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Stone</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zimmerman</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Donald</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Berkowitz</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Finn</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Jahangir</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Scoville</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Reese</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Friedman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bastarache</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>van der Heijden</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Carter</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Alexander</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Choe</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chastain</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zic</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Horst</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Turker</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Osmundson</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Idrees</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kieman</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Padmanabhan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bailey</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Schlegel</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chambless</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gibson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Osterman</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Wheless</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Assessing the accuracy and reliability of AI-generated medical responses: an evaluation of the Chat-GPT model</article-title>
          <source>Research Square. Preprint posted online on February 28, 2023</source>
          <year>2023</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36909565"/>
          </comment>
          <pub-id pub-id-type="doi">10.21203/rs.3.rs-2566942/v1</pub-id>
          <pub-id pub-id-type="medline">36909565</pub-id>
          <pub-id pub-id-type="pii">rs.3.rs-2566942</pub-id>
          <pub-id pub-id-type="pmcid">PMC10002821</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Frieske</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ishii</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bang</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Madotto</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Survey of hallucination in natural language generation</article-title>
          <source>ACM Comput Surv</source>
          <year>2023</year>
          <volume>55</volume>
          <issue>12</issue>
          <fpage>1</fpage>
          <lpage>38</lpage>
          <pub-id pub-id-type="doi">10.1145/3571730</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bhayana</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bleakney</surname>
              <given-names>RR</given-names>
            </name>
            <name name-style="western">
              <surname>Krishna</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 in radiology: improvements in advanced reasoning</article-title>
          <source>Radiology</source>
          <year>2023</year>
          <volume>307</volume>
          <issue>5</issue>
          <fpage>e230987</fpage>
          <pub-id pub-id-type="doi">10.1148/radiol.230987</pub-id>
          <pub-id pub-id-type="medline">37191491</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zaharia</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Zou</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>How is ChatGPT’s behavior changing over time?</article-title>
          <source>arXiv. Preprint posted online on July 18, 2023</source>
          <year>2023</year>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2307.09009"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2307.09009</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
