<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v27i1e68486</article-id>
      <article-id pub-id-type="pmid">39854611</article-id>
      <article-id pub-id-type="doi">10.2196/68486</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Virtual Patients Using Large Language Models: Scalable, Contextualized Simulation of Clinician-Patient Dialogue With Feedback</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Coristine</surname>
            <given-names>Andrew</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Li</surname>
            <given-names>Danni</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Foroutan</surname>
            <given-names>Dr Behzad</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Ojo</surname>
            <given-names>Tolulope Funmilola</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Menon Naliyatthaliyazchayil</surname>
            <given-names>Parvati</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Aditya</surname>
            <given-names>Ishan</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Cook</surname>
            <given-names>David A</given-names>
          </name>
          <degrees>MHPE, MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution/>
            <institution>Division of General Internal Medicine</institution>
            <institution>Mayo Clinic College of Medicine and Science</institution>
            <addr-line>200 First St SW</addr-line>
            <addr-line>Rochester, MN, 55905</addr-line>
            <country>United States</country>
            <phone>1 507 266 4156</phone>
            <email>cook.david33@mayo.edu</email>
          </address>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-2383-4633</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Overgaard</surname>
            <given-names>Joshua</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0002-6013-1902</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Pankratz</surname>
            <given-names>V Shane</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-3742-040X</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Del Fiol</surname>
            <given-names>Guilherme</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9954-6799</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Aakre</surname>
            <given-names>Chris A</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9817-8533</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Division of General Internal Medicine</institution>
        <institution>Mayo Clinic College of Medicine and Science</institution>
        <addr-line>Rochester, MN</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Multidisciplinary Simulation Center</institution>
        <institution>Mayo Clinic College of Medicine and Science</institution>
        <addr-line>Rochester, MN</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Health Sciences Center</institution>
        <institution>University of New Mexico</institution>
        <addr-line>Albuquerque, NM</addr-line>
        <country>United States</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Department of Biomedical Informatics</institution>
        <institution>University of Utah School of Medicine</institution>
        <addr-line>Salt Lake City, UT</addr-line>
        <country>United States</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: David A Cook <email>cook.david33@mayo.edu</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>4</day>
        <month>4</month>
        <year>2025</year>
      </pub-date>
      <volume>27</volume>
      <elocation-id>e68486</elocation-id>
      <history>
        <date date-type="received">
          <day>6</day>
          <month>11</month>
          <year>2024</year>
        </date>
        <date date-type="rev-request">
          <day>17</day>
          <month>12</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>3</day>
          <month>1</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>13</day>
          <month>1</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©David A Cook, Joshua Overgaard, V Shane Pankratz, Guilherme Del Fiol, Chris A Aakre. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 04.04.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2025/1/e68486" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Virtual patients (VPs) are computer screen–based simulations of patient-clinician encounters. VP use is limited by cost and low scalability.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We aimed to show that VPs powered by large language models (LLMs) can generate authentic dialogues, accurately represent patient preferences, and provide personalized feedback on clinical performance. We also explored using LLMs to rate the quality of dialogues and feedback.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We conducted an intrinsic evaluation study rating 60 VP-clinician conversations. We used carefully engineered prompts to direct OpenAI’s generative pretrained transformer (GPT) to emulate a patient and provide feedback. Using 2 outpatient medicine topics (chronic cough diagnosis and diabetes management), each with permutations representing different patient preferences, we created 60 conversations (dialogues plus feedback): 48 with a human clinician and 12 “self-chat” dialogues with GPT role-playing both the VP and clinician. Primary outcomes were dialogue authenticity and feedback quality, rated using novel instruments for which we conducted a validation study collecting evidence of content, internal structure (reproducibility), relations with other variables, and response process. Each conversation was rated by 3 physicians and by GPT. Secondary outcomes included user experience, bias, patient preferences represented in the dialogues, and conversation features that influenced authenticity.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The average cost per conversation was US $0.51 for GPT-4.0-Turbo and US $0.02 for GPT-3.5-Turbo. Mean (SD) conversation ratings, maximum 6, were overall dialogue authenticity 4.7 (0.7), overall user experience 4.9 (0.7), and average feedback quality 4.7 (0.6). For dialogues created using GPT-4.0-Turbo, physician ratings of patient preferences aligned with intended preferences in 20 to 47 of 48 dialogues (42%-98%). Subgroup comparisons revealed higher ratings for dialogues using GPT-4.0-Turbo versus GPT-3.5-Turbo and for human-generated versus self-chat dialogues. Feedback ratings were similar for human-generated versus GPT-generated ratings, whereas authenticity ratings were lower. We did not perceive bias in any conversation. Dialogue features that detracted from authenticity included that GPT was verbose or used atypical vocabulary (93/180, 51.7% of conversations), was overly agreeable (n=56, 31%), repeated the question as part of the response (n=47, 26%), was easily convinced by clinician suggestions (n=35, 19%), or was not disaffected by poor clinician performance (n=32, 18%). For feedback, detractors included excessively positive feedback (n=42, 23%), failure to mention important weaknesses or strengths (n=41, 23%), or factual inaccuracies (n=39, 22%). Regarding validation of dialogue and feedback scores, items were meticulously developed (content evidence), and we confirmed expected relations with other variables (higher ratings for advanced LLMs and human-generated dialogues). Reproducibility was suboptimal, due largely to variation in LLM performance rather than rater idiosyncrasies.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>LLM-powered VPs can simulate patient-clinician dialogues, demonstrably represent patient preferences, and provide personalized performance feedback. This approach is scalable, globally accessible, and inexpensive. LLM-generated ratings of feedback quality are similar to human ratings.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>simulation training</kwd>
        <kwd>natural language processing</kwd>
        <kwd>computer-assisted instruction</kwd>
        <kwd>clinical decision-making</kwd>
        <kwd>clinical reasoning</kwd>
        <kwd>machine learning</kwd>
        <kwd>virtual patient</kwd>
        <kwd>natural language generation</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Translating advances in biomedical knowledge and knowledge synthesis into data-driven, patient-centered, and contextualized management decisions remains a wicked challenge. As we seek to prevent errors in clinical practice [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>] and promote high-value care [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>], we need to better understand clinical reasoning and how to support its development and application [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Because clinical reasoning is case specific [<xref ref-type="bibr" rid="ref6">6</xref>] and educationally opportune encounters with real patients are finite, education and research in this field require a scalable approach to emulating authentic patient-clinician interactions. Virtual patients (VPs) powered by large language models (LLMs) offer a potential solution.</p>
      <p>VPs—computer screen–based simulations of patient-clinician encounters [<xref ref-type="bibr" rid="ref7">7</xref>]—have demonstrated efficacy in teaching, assessing, and studying clinical reasoning [<xref ref-type="bibr" rid="ref8">8</xref>] and could also support validation of decision-support tools before clinical implementation [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. VPs may be particularly important for <italic>management reasoning</italic>, which is a subset of clinical reasoning. In contrast with diagnostic reasoning, management reasoning is arguably more difficult, more complex to study, and more important [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. Yet, it has received scant investigation owing to challenges in replicating management tasks—most notably patient-clinician conversations—which necessarily involve shared decision-making [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref16">16</xref>] and contextualization of care (ie, consideration of social determinants of health, patient preferences, and comorbid conditions) [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref20">20</xref>].</p>
      <p>To date, VP use has been limited by the high costs and logistical challenges of large-scale implementation. One survey found that 85% of bespoke VPs cost &gt;US $10,000 per case and required &gt;16 months to produce [<xref ref-type="bibr" rid="ref21">21</xref>]. Commercial VP libraries exist, but subscriptions are expensive (approximately US $100/student/y). Hence, VP implementations typically comprise few cases and lack case-to-case variability in salient features (eg, diagnosis, illness severity, preferences, and ethnic diversity) [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>].</p>
      <p>Providing performance feedback to clinicians is also essential in clinical skill development [<xref ref-type="bibr" rid="ref23">23</xref>], yet it is commonly of low quality or simply absent [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref27">27</xref>]. Specific, actionable feedback [<xref ref-type="bibr" rid="ref28">28</xref>-<xref ref-type="bibr" rid="ref30">30</xref>] on VP-clinician interactions could promote clinical reasoning and communication skills.</p>
      <p>LLMs represent a disruptive technology [<xref ref-type="bibr" rid="ref31">31</xref>], offering an unprecedented opportunity to transform VP production and use, enabling scalable, accessible (ie, inexpensive and low expertise), interoperable, and reusable [<xref ref-type="bibr" rid="ref32">32</xref>] simulations of patient-clinician encounters. Our aim was to show proof of concept that VPs powered by OpenAI’s generative pretrained transformer (GPT) can generate authentic preference-sensitive dialogues and high-quality feedback. We hypothesized that human ratings of <italic>observed</italic> patient preferences would agree with corresponding <italic>planned</italic> preferences (ie, that GPT would perceptibly represent the intended preference). We compared GPT-4.0-Turbo against the earlier, cheaper GPT-3.5-Turbo, hypothesizing that GPT-4.0-Turbo would be superior. We also piloted GPT to role-play the clinician, hypothesizing that conversations involving human clinicians would be superior.</p>
      <p>As a substudy, we aimed to pilot LLMs for rating the quality of VP-clinician dialogues and feedback. Artificial intelligence (AI) has long been used to rate narrative text [<xref ref-type="bibr" rid="ref33">33</xref>-<xref ref-type="bibr" rid="ref37">37</xref>], but this typically requires supervised machine learning—using human-graded texts to train the AI system. We explored the use of LLMs without any training exemplars (ie, zero-shot learning).</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Overview</title>
        <p>We conducted an intrinsic evaluation study (ie, a study that evaluates the quality of computer-generated outputs on specific predefined tasks, rather than real-world learners or tasks), rating the quality of 60 conversations (ie, the combination of VP-clinician dialogue and LLM-generated performance feedback) between an LLM-powered VP and a clinician. We created 3 novel instruments to rate dialogue authenticity and feedback quality. Three physicians and GPT rated all conversations. <xref rid="figure1" ref-type="fig">Figure 1</xref> summarizes the study design.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Overview of the study design. GPT=GPT-4.0-Turbo except as otherwise noted. A “conversation” refers to the virtual patient–clinician dialogue plus feedback. API: application programming interface; GPT: generative pretrained transformer.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e68486_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>No human subjects were involved in this study, other than the study investigators. As such, we did not pursue appraisal by an ethics review board.</p>
      </sec>
      <sec>
        <title>Technical Preparation: LLM-Powered VP Interface</title>
        <p>We used Python to create a text VP interface, as previously described [<xref ref-type="bibr" rid="ref38">38</xref>], that accesses GPT through the OpenAI application programming interface (API). We iteratively and rigorously engineered detailed “prompts” guiding GPT to emulate a diagnosis-focused or management-focused VP and provide feedback. To instantiate a specific VP, the interface accesses a 1-page case description. Narrative S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> reports the full prompt and 1 case description.</p>
      </sec>
      <sec>
        <title>Conversation Planning</title>
        <p>We selected as topics 2 common problems in ambulatory medicine: chronic cough (a diagnostic task) and diabetes (a management task). For each topic, we created a written description of a prototypical scenario. In this pilot study, we did not base scenarios on specific real patients.</p>
        <p>We planned 4 permutations per topic by varying the patient preferences or GPT model:</p>
        <list list-type="bullet">
          <list-item>
            <p>Case 1: patient has good insurance and wants to avoid tests or new medications (GPT-4.0-Turbo)</p>
          </list-item>
          <list-item>
            <p>Case 2: patient has financial concerns such as limited income and poor insurance (GPT-4.0-Turbo)</p>
          </list-item>
          <list-item>
            <p>Case 3: patient is anxious and pushes for more tests and more aggressive treatments (GPT-4.0-Turbo)</p>
          </list-item>
          <list-item>
            <p>Case 4: same as case 1 (GPT-3.5-Turbo)</p>
          </list-item>
        </list>
        <p>The details on dialogue permutations are provided in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The dialogues were further permuted for 3 clinician personas: an average third-year medical student, a poor-performing third-year medical student, and an average second-year internal medicine resident.</p>
      </sec>
      <sec>
        <title>Conversation Creation</title>
        <p>We used the LLM-powered VP interface to create 48 simulated conversations between the VP and a human clinician. A representative conversation is provided in Narrative S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. A board-certified internal medicine physician role-played the clinician twice for each permutation (ie, 2 topics, 4 case variations, 3 clinician personas, and 2 replications=48 conversations). One investigator role-played all conversations for cough and another investigator role-played those for diabetes. The investigator knew which clinician persona to portray but was not told which case variation GPT portrayed. Using the instruments defined later in this report, the investigator rated dialogue quality immediately after ending each dialogue. GPT (via the VP interface) then offered detailed performance feedback, and the investigator rated feedback quality and perceived bias.</p>
        <p>In addition, we used GPT-4.0-Turbo to play the role of an “excellent physician,” and “self-chat” as both the VP and clinician using independent GPT threads for cases 1 to 3, with 2 replications each (ie, 2 topics, 3 case variations, and 2 replications=12 GPT-GPT self-chats).</p>
        <p>Each conversation was saved verbatim, along with time spent, word count, and GPT “tokens” used. We calculated costs using GPT pricing.</p>
      </sec>
      <sec>
        <title>Instrument Creation</title>
        <sec>
          <title>Overview</title>
          <p>We created 3 novel instruments for rating the quality of VP dialogues and feedback (<xref ref-type="table" rid="table1">Table 1</xref>), and 1 item to flag potential bias. We also collected granular information on conversation features that influenced authenticity. For the 3 novel instruments, we conducted a validation study collecting validity evidence from 4 of 5 potential sources [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]: content (ie, grounding of the instruments in theory and prior empirical work); internal structure (ie, rating reproducibility); relations with other variables (ie, sensitivity of ratings to case differences, including expectation of higher ratings for more advanced LLMs and human clinician personas); and response process (ie, clarification on why raters responded as they did). Narrative S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> further describes instrument development and validation planning.</p>
          <table-wrap position="float" id="table1">
            <label>Table 1</label>
            <caption>
              <p>Rating scales for appraising conversation quality: constructs, items, operational clarifications, and reproducibility<sup>a</sup>.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="20"/>
              <col width="100"/>
              <col width="370"/>
              <col width="390"/>
              <col width="60"/>
              <col width="60"/>
              <thead>
                <tr valign="top">
                  <td colspan="2">Item</td>
                  <td>Verbatim item wording</td>
                  <td>Operational clarifications<sup>a</sup></td>
                  <td>ICC<sup>b</sup>: human (N=3)<sup>c</sup></td>
                  <td>ICC: GPT<sup>d</sup> (N=3)<sup>c</sup></td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="6">
                    <bold>Dialogue authenticity</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Humanlike</td>
                  <td>The virtual patient’s responses were humanlike.</td>
                  <td>Sensible, natural, and conversational; uses appropriate word choice, phrasing, and tone</td>
                  <td>0.34</td>
                  <td>0.29</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Coherent</td>
                  <td>The virtual patient’s responses were coherent.</td>
                  <td>Contextually appropriate and internally consistent (ie, logical) over the course of the dialogue</td>
                  <td>0.40</td>
                  <td>0.45</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Personal</td>
                  <td>The virtual patient’s responses were personal.</td>
                  <td>Reflecting preferences, opinions, values, and priorities; not overly agreeable or pleasing</td>
                  <td>0.22</td>
                  <td>0.35</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Relevant</td>
                  <td>The virtual patient’s responses were relevant and meaningful.</td>
                  <td>Meaningful, useful, helpful as a clinically relevant simulation; requires or supports clinical reasoning; stimulates appropriate emotions and empathy</td>
                  <td>0.30</td>
                  <td>0.20</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Overall</td>
                  <td>The dialogue as a whole mirrored a real-life patient-clinician conversation.</td>
                  <td>—<sup>e</sup></td>
                  <td>0.34</td>
                  <td>0.49</td>
                </tr>
                <tr valign="top">
                  <td colspan="6">
                    <bold>User experience</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Realness</td>
                  <td>This was an authentic representation of a real-world experience.</td>
                  <td>Similar to a real-world situation</td>
                  <td>0.37</td>
                  <td>0.29</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Cognitive authenticity</td>
                  <td>I had to continuously revise my mental image of the problem using new information.</td>
                  <td>Requires or stimulates the same mental activities, same decisions as in real situation; real professional demand</td>
                  <td>0.24</td>
                  <td>—</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Variability</td>
                  <td>The interaction seemed unscripted and appropriately complex.</td>
                  <td>Reflects natural variation in responses; spontaneous, unstructured, unplanned, and flexible; complex, multidimensional (not superficial); not robot-like or prefabricated</td>
                  <td>0.19</td>
                  <td>—</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Involvement<sup>f</sup></td>
                  <td>I was fully engaged in this conversation.</td>
                  <td>Immersed, focused (not distracted), captivated; stimulated empathy and authentic emotions</td>
                  <td>X<sup>g</sup></td>
                  <td>—</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Overall</td>
                  <td>I felt as if I were the doctor.</td>
                  <td>—</td>
                  <td>0.17</td>
                  <td>—</td>
                </tr>
                <tr valign="top">
                  <td colspan="6">
                    <bold>Feedback</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Evidence based</td>
                  <td>The feedback correctly identifies important weaknesses and strengths in the clinician’s performance.</td>
                  <td>Specific observations of behavior; accurately interpreted; well prioritized</td>
                  <td>0.15</td>
                  <td>0.09</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Actionable</td>
                  <td>The feedback contains suggestions that are specific and actionable.</td>
                  <td>Specific and actionable suggestions for behavior change</td>
                  <td>0.17</td>
                  <td>0.26</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Connected</td>
                  <td>The feedback correctly connects each suggestion with specific strengths and weaknesses.</td>
                  <td>Explicit and logical connection between the observed behaviors and suggested changes</td>
                  <td>0.22</td>
                  <td>0.25</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Balanced</td>
                  <td>The feedback balances corrective and reinforcing statements appropriate to the clinician’s performance.</td>
                  <td>Includes both praise and critique; a balance of positive and negative statements matches actual performance</td>
                  <td>0.08</td>
                  <td>0.16</td>
                </tr>
                <tr valign="top">
                  <td colspan="2">
                    <bold>Bias (overall)</bold>
                  </td>
                  <td>Did you detect any indication of bias or stereotyping in the dialogue or feedback?</td>
                  <td>Includes stereotyping, disparagement, dehumanization, erasure, and inequitable performance</td>
                  <td>1<sup>f</sup></td>
                  <td>—</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table1fn1">
                <p><sup>a</sup>All conversations were rated at the time of their creation by the physician who created them (“initial ratings”) and later by blinded human raters and by GPT (“final ratings”). Items were presented in the sequence shown above. Operational clarifications were included only for final ratings. A “conversation” refers to the VP-clinician dialogue plus feedback. During conversation creation, each dialogue was rated before feedback was provided. Response options for all rating scale items ranged from 1=strongly disagree to 6=strongly agree. For authenticity and experience, a rating of 6 was operationally defined as “This is exactly what I would expect in a real conversation; this could have come from a human patient.” For feedback, a rating of 6 was operationally defined as “This is surprisingly good, better than I would expect from a trained human clinician-supervisor.” See Box S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for additional details on operational criteria. Response options for bias were Yes and No.</p>
              </fn>
              <fn id="table1fn2">
                <p><sup>b</sup>ICC: intraclass correlation coefficient.</p>
              </fn>
              <fn id="table1fn3">
                <p><sup>c</sup>An ICC representing the overall reproducibility coefficient for a single rating. “Human” indicates agreement across 3 blinded board-certified internal medicine physicians; “GPT” indicates agreement across 3 rating runs from GPT-4.0-Turbo.</p>
              </fn>
              <fn id="table1fn4">
                <p><sup>d</sup>GPT: generative pretrained transformer.</p>
              </fn>
              <fn id="table1fn5">
                <p><sup>e</sup>GPT did not rate user experience and bias.</p>
              </fn>
              <fn id="table1fn6">
                <p><sup>f</sup>This item was created as part of our instrument, reflecting the corresponding domain in the underlying conceptual framework. However, we did not code this feature in this study, as we investigators did not feel authentically “engaged” in the task when creating multiple conversations. This item could be used in future studies with real learners.</p>
              </fn>
              <fn id="table1fn7">
                <p><sup>g</sup>There was 100% agreement across all raters on the bias item.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
        <sec>
          <title>Dialogue Rating Items</title>
          <p>Two instruments focused on the dialogues: dialogue authenticity and user (ie, clinician) experience. To generate items to rate dialogue authenticity, we drew on the literature on dialogue systems and natural language generation [<xref ref-type="bibr" rid="ref41">41</xref>-<xref ref-type="bibr" rid="ref51">51</xref>] from which we distilled 5 repeatedly emphasized constructs: responses are <italic>humanlike</italic> (ie, sensible, natural, and avoiding bias), <italic>coherent</italic> (ie, contextually appropriate and internally consistent), engaging or <italic>personal</italic> (ie, reflecting preferences, empathy, and personality), helpful or <italic>relevant</italic> (ie, specific, useful, and meaningful), and <italic>correct</italic> (for knowledge-delivery systems). We dropped “correct” since our purpose was dialogue and not knowledge delivery. We considered but omitted a domain for fluency because recent literature suggests that fluency can be presumed for contemporary AI models [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref44">44</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. We created 1 item for each remaining construct and an overall item, resulting in a 5-item instrument.</p>
          <p>To generate items to rate user experience, we merged 2 conceptual frameworks for measuring authenticity in VPs—one emphasizing decision-making and cognitive strategies [<xref ref-type="bibr" rid="ref52">52</xref>] and the other highlighting realism, empathy, and variability [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref53">53</xref>]. We added a third empirically derived framework for evaluating “presence” in virtual reality (ie, realness, involvement, and spatial “physical” presence) [<xref ref-type="bibr" rid="ref54">54</xref>,<xref ref-type="bibr" rid="ref55">55</xref>]. We synthesized these into 4 constructs: <italic>realness</italic> (ie, similar to a real-world situation); <italic>cognitive authenticity</italic> (ie, real mental activities and decisions); <italic>variability</italic> (ie, case-to-case variation and spontaneous responses); and <italic>involvement</italic> (ie, user engaged and immersed). We created 1 item for each construct and an overall item, resulting in a 5-item instrument. In this study, we did not rate “involvement” because we never felt “immersed” when creating and rating multiple conversations; however, we plan to rate this in future studies.</p>
        </sec>
        <sec>
          <title>Feedback Items</title>
          <p>To generate items to rate feedback, we integrated findings from focus group studies [<xref ref-type="bibr" rid="ref24">24</xref>,<xref ref-type="bibr" rid="ref28">28</xref>], published instruments [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref56">56</xref>,<xref ref-type="bibr" rid="ref57">57</xref>], and other empirical and conceptual studies [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref58">58</xref>-<xref ref-type="bibr" rid="ref61">61</xref>] and identified 4 recurrent constructs: <italic>evidence-based</italic> (ie, behavior-focused) observations; specific, <italic>actionable</italic> suggestions; observations explicitly <italic>connected</italic> with suggestions; and <italic>balanced</italic> praise and critique. We created 1 item for each construct, resulting in a 4-item instrument. We did not rate feedback “overall”; instead, we calculated the average rating.</p>
        </sec>
        <sec>
          <title>Further Procedures for Dialogue and Feedback Instruments</title>
          <p>Three experts in VPs or natural language generation reviewed the 3 instruments and approved them with minor clarifications. Response options ranged from 1=strongly disagree to 6=strongly agree. After case creation and before the final rating phase, we added brief operational criteria for each response option (Box S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
        </sec>
        <sec>
          <title>Bias Item</title>
          <p>Bias—“skew that produces a type of harm toward different social groups” [<xref ref-type="bibr" rid="ref62">62</xref>]—is a well-known risk in AI generally and natural language generation specifically [<xref ref-type="bibr" rid="ref62">62</xref>-<xref ref-type="bibr" rid="ref65">65</xref>]. Bias can arise from the input (ie, training) data, annotation process, input representations, models, or research design [<xref ref-type="bibr" rid="ref63">63</xref>], resulting in harms of stereotyping, disparagement, dehumanization, erasure, and inequitable performance [<xref ref-type="bibr" rid="ref62">62</xref>] to nondominant groups. These groups can be defined by demographics such as gender, age, sexual orientation, physical appearance, disability, nationality, ethnicity, race, socioeconomic status, religion, and culture [<xref ref-type="bibr" rid="ref64">64</xref>]. Raters were instructed to flag and describe any bias or stereotyping in the dialogue or feedback, specifically considering the sources and groups noted earlier.</p>
        </sec>
        <sec>
          <title>Conversation Features That Influenced Authenticity</title>
          <p>Following the dialogue ratings, and again after the feedback ratings, we asked, “What specific features of this [dialog | feedback] detracted from its authenticity?” and “What specific features enhanced its authenticity?” Investigators responded using free text during conversation creation. We collated responses into a list of features and selected from this list during the final ratings.</p>
        </sec>
      </sec>
      <sec>
        <title>Final Ratings of Conversations</title>
        <p>As described earlier, each investigator rated conversation quality at the time of conversation creation.</p>
        <p>Later, all conversations were rated again by all 3 investigators for dialogue authenticity, user experience, feedback quality, and bias (ie, “final ratings”). At this stage, raters also indicated their perception of patient preferences represented in the dialogue regarding (1) less versus more testing, (2) the importance of cost, and (3) prioritization of lifestyle or control of illness. They also indicated specific features of the conversation that detracted from or enhanced its authenticity.</p>
        <p>Raters were blinded to the permutation. Conversations were randomized for final ratings (ie, a unique sequence for each rater). Raters entered data using an internet-based form implemented using DistillerSR.</p>
        <p>We also used GPT-4.0-Turbo (via the OpenAI API) to rate each conversation 3 times for dialogue authenticity and feedback quality but not user experience.</p>
      </sec>
      <sec>
        <title>Data Analysis</title>
        <sec>
          <title>Reproducibility of Final Ratings</title>
          <p>To appraise rating reproducibility, we estimated variance components and calculated a single-rating intraclass correlation coefficient (ICC), which was interpreted using criteria from Landis and Koch [<xref ref-type="bibr" rid="ref66">66</xref>] (ie, 0-0.2=slight; 0.21-0.4=fair; 0.41-0.6=moderate; and 0.61-0.8=substantial).</p>
        </sec>
        <sec>
          <title>Comparison Across Design Features</title>
          <p>We selected 5 outcomes (ie, overall authenticity, humanlike, overall experience, realness, and average feedback) as most aligned with our study aims and compared these across GPT models, topics, clinician personas, and human versus LLM raters. Using mixed models ANOVA, we conducted paired analyses that accounted for features of the factorial design and, for final ratings, repeated measures from multiple raters. We used SAS 9.4 (SAS Institute Inc) for all analyses and set the α level at .05. We make inferences of statistical significance using 95% CIs.</p>
        </sec>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Instrument Validation</title>
        <p>We conducted a validation study for the novel instruments for rating dialogue authenticity, user experience, and feedback quality. Evidence for content is presented in the Methods section and Narrative S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Additional evidence is presented and discussed subsequently, including evidence for internal structure (ie, rating reproducibility was suboptimal), relations with other variables (ie, ratings differed as expected across conversation subgroups), and response process (ie, questions probed investigators’ thought processes regarding features that detracted from or enhanced conversation quality).</p>
      </sec>
      <sec>
        <title>Conversation Creation Resources</title>
        <p>We created 48 VP-clinician conversations (ie, dialogue plus feedback) with human physicians playing the clinician role and 12 conversations with GPT as the clinician. Each human-created conversation lasted for an average of 622 seconds (of which GPT’s responses took 90 seconds) and cost US $0.50 (see <xref ref-type="table" rid="table2">Table 2</xref> for additional details including estimates of measurement variability, ie, SD).</p>
        <p>GPT-3.5-Turbo was significantly faster than GPT-4.0-Turbo (62 vs 100 seconds; difference 38, 95% CI 29-47) and much cheaper (US $0.02 vs US $0.51 per conversation), although quality was substantially lower (see the subsequent section). Compared with diabetes, cough conversations required substantially more GPT time (122 vs 59 seconds) and tokens (72,745 vs 27,241) even though the dialogue itself was only slightly longer (1165 vs 908 words). This was due to more back-and-forth turns in the dialogue (mean 37 vs 14 turns), because each time GPT processes a clinician statement (eg, even a short query like “Do you have heartburn?”), the entire dialogue is resubmitted to GPT as context.</p>
        <p>The average time for the 12 GPT-GPT (ie, self-chat) conversations was 113 seconds: 62 seconds for the clinician, and 51 seconds for the VP. The average cost was US $0.29 because these dialogues had fewer turns (mean 21 turns).</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Conversation creation: resource metrics and initial ratings of conversation quality<sup>a</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="190"/>
            <col width="130"/>
            <col width="130"/>
            <col width="130"/>
            <col width="130"/>
            <col width="130"/>
            <col width="0"/>
            <col width="130"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Metric</td>
                <td colspan="6">Human clinician, mean (SD), median</td>
                <td>Self-chat (all, n=12), mean (SD), median</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>All (n=48)</td>
                <td>GPT-4.0 (n=36)</td>
                <td>GPT-3.5 (n=12)</td>
                <td>Diabetes (n=24)</td>
                <td>Cough (n=24)</td>
                <td colspan="2">
                  <break/>
                </td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="9">
                  <bold>Resources and time</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Total time (s)<sup>b</sup></td>
                <td>622 (173), 611</td>
                <td>653 (168), 669</td>
                <td>551 (171), 508</td>
                <td>617 (158), 611</td>
                <td>627 (189), 619</td>
                <td colspan="2">113 (20), 107</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Physician time (s)<sup>b</sup></td>
                <td>534 (166), 553</td>
                <td>553 (162), 556</td>
                <td>488 (173), 477</td>
                <td>562 (151), 553</td>
                <td>510 (178), 511</td>
                <td colspan="2">62 (14), 57</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Virtual patient (GPT) time (s)</td>
                <td>90 (38), 76</td>
                <td>100 (36), 99</td>
                <td>62 (28), 63</td>
                <td>59 (16), 65</td>
                <td>122 (24), 129</td>
                <td colspan="2">51 (8), 51</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Words (dialogue)<sup>c</sup></td>
                <td>1037 (302), 1003</td>
                <td>1092 (304), 1059</td>
                <td>871 (238), 810</td>
                <td>908 (232), 942</td>
                <td>1165 (313), 1165</td>
                <td colspan="2">1377 (351), 1291</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Words (feedback)<sup>c</sup></td>
                <td>387 (118), 425</td>
                <td>449 (50), 450</td>
                <td>202 (38), 198</td>
                <td>371 (94), 413</td>
                <td>403 (138), 458</td>
                <td colspan="2">424 (43), 407</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Tokens (total)<sup>c</sup></td>
                <td>49,993 (25,609), 47,621</td>
                <td>50,788 (25,788), 46,826</td>
                <td>47,607 (26,036), 47,894</td>
                <td>27,241 (6139), 26,205</td>
                <td>72,745 (14,904), 66,960</td>
                <td colspan="2">28,628 (7997), 27,209</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Dialogue turns<sup>c</sup></td>
                <td>26 (13), 24</td>
                <td>26 (13), 24</td>
                <td>26 (13), 24</td>
                <td>14 (3), 15</td>
                <td>37 (7), 34</td>
                <td colspan="2">21 (6), 20</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cost per conversation, US $<sup>d</sup></td>
                <td>0.50 (0.26), 0.48</td>
                <td>0.51 (0.26), 0.47</td>
                <td>0.02 (0.01), 0.02</td>
                <td>0.27 (0.06), 0.26</td>
                <td>0.73 (0.15), 0.67</td>
                <td colspan="2">0.29 (0.08), 0.27</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Dialogue authenticity<sup>e</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Overall</td>
                <td>4.6 (0.6), 5</td>
                <td>4.8 (0.6), 5</td>
                <td>3.9 (0.3), 4</td>
                <td>4.5 (0.6), 4.5</td>
                <td>4.8 (0.7), 5</td>
                <td colspan="2">—<sup>f</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Humanlike</td>
                <td>4.8 (0.7), 5</td>
                <td>5.1 (0.5), 5</td>
                <td>3.9 (0.5), 4</td>
                <td>4.7 (0.6), 5</td>
                <td>4.9 (0.8), 5</td>
                <td colspan="2">—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Coherent</td>
                <td>5.4 (0.6), 5</td>
                <td>5.5 (0.6), 5.5</td>
                <td>5.3 (0.7), 5</td>
                <td>4.9 (0.3), 5</td>
                <td>6.0 (0.2), 6</td>
                <td colspan="2">—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Personal</td>
                <td>5.0 (0.7), 5</td>
                <td>5.4 (0.5), 5</td>
                <td>4.1 (0.3), 4</td>
                <td>4.8 (0.4), 5</td>
                <td>5.3 (0.8), 6</td>
                <td colspan="2">—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Relevant</td>
                <td>5.3 (0.7), 5</td>
                <td>5.4 (0.6), 5</td>
                <td>4.7 (0.7), 5</td>
                <td>4.9 (0.4), 5</td>
                <td>5.6 (0.6), 6</td>
                <td colspan="2">—</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>User experience<sup>e</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Overall</td>
                <td>4.9 (0.6), 5</td>
                <td>5.0 (0.5), 5</td>
                <td>4.4 (0.5), 4</td>
                <td>4.7 (0.5), 5</td>
                <td>5.0 (0.6), 5</td>
                <td colspan="2">—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Realness</td>
                <td>4.6 (0.7), 5</td>
                <td>4.8 (0.7), 5</td>
                <td>4.0 (0.4), 4</td>
                <td>4.3 (0.8), 5</td>
                <td>4.8 (0.6), 5</td>
                <td colspan="2">—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Cognitive authenticity</td>
                <td>4.5 (0.8), 4</td>
                <td>4.6 (0.8), 5</td>
                <td>4.2 (0.7), 4</td>
                <td>3.9 (0.4), 4</td>
                <td>5.1 (0.5), 5</td>
                <td colspan="2">—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Variability</td>
                <td>5.0 (0.7), 5</td>
                <td>5.2 (0.6), 5</td>
                <td>4.5 (0.7), 5</td>
                <td>4.8 (0.4), 5</td>
                <td>5.3 (0.8), 5</td>
                <td colspan="2">—</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Feedback<sup>e</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Average</td>
                <td>4.6 (0.9), 5</td>
                <td>4.9 (0.6), 5</td>
                <td>3.7 (1.0), 4</td>
                <td>4.4 (0.9), 5</td>
                <td>4.9 (0.8), 4.6</td>
                <td colspan="2">—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Evidence based</td>
                <td>4.3 (1.1), 4.5</td>
                <td>4.6 (0.9), 5</td>
                <td>3.5 (1.0), 3.5</td>
                <td>4.4 (1.0), 5</td>
                <td>4.3 (1.1), 4</td>
                <td colspan="2">—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Actionable</td>
                <td>4.9 (0.8), 5</td>
                <td>5.2 (0.5), 5</td>
                <td>4.0 (1.0), 4</td>
                <td>4.6 (0.8), 5</td>
                <td>5.3 (0.7), 5</td>
                <td colspan="2">—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Connected</td>
                <td>4.8 (0.9), 5</td>
                <td>5.1 (0.6), 5</td>
                <td>3.8 (0.9), 4</td>
                <td>4.5 (0.8), 5</td>
                <td>5.1 (0.9), 5</td>
                <td colspan="2">—</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Balanced</td>
                <td>4.5 (1.1), 5</td>
                <td>4.8 (0.9), 5</td>
                <td>3.6 (1.3), 4</td>
                <td>4.2 (1.2), 5</td>
                <td>4.8 (0.9), 5</td>
                <td colspan="2">—</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>The clinician was a human physician for the “human clinician” conversations and GPT-4.0-Turbo for the “self-chat” conversations. The virtual patient was GPT for all conversations.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>n=37 for total time and human physician time, after excluding 11 conversations in which the recorded time was inexact due to interruptions.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>Dialogue was generated as an interaction between the virtual patient (GPT) and clinician (human or GPT). Feedback was generated by GPT. A “conversation” refers to the VP-clinician dialogue plus feedback. Tokens include the entire conversation (both dialogue and feedback; and for self-chat, both patient and physician).</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>Pricing (per OpenAI, May 30, 2024): US $1.00/100,000 tokens for GPT-4.0-Turbo; US $0.05/100,000 tokens for GPT-3.5-Turbo.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>All conversations (dialogue and feedback) were rated at the time of their creation by the physician who created them, immediately following the dialogue and feedback (GPT did not provide initial ratings following self-chat). Response options for all items ranged from 1=strongly disagree to 6=strongly agree.</p>
            </fn>
            <fn id="table2fn6">
              <p><sup>f</sup>Not applicable.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Representation of Patient Preferences</title>
        <p>Each case was written to represent patient preferences in testing or treatment, cost of care, and prioritization of illness control versus lifestyle. During the blinded final rating, we independently indicated whether the VP represented such preferences in the dialogue. The reproducibilities (ie, ICCs) for these ratings were as follows: testing or treatment, 0.59; cost of care, 0.75; and prioritization of control, 0.39.</p>
        <p>VPs demonstrably represented planned preferences with high frequency (<xref ref-type="table" rid="table3">Table 3</xref>). For dialogues created using GPT-4.0-Turbo, 5 of 6 nonneutral planned preferences were recognized as such in ≥54% of dialogues, and all 3 neutral planned preferences were rated as “no opinion” in ≥90% of the dialogues. We observed comparable results for GPT-3.5-Turbo.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Patient preferences reflected in dialogues: planned versus perceived by raters.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="170"/>
            <col width="0"/>
            <col width="130"/>
            <col width="0"/>
            <col width="130"/>
            <col width="0"/>
            <col width="130"/>
            <col width="0"/>
            <col width="150"/>
            <col width="0"/>
            <col width="130"/>
            <col width="0"/>
            <col width="130"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Perceived preference (human rating)</td>
                <td colspan="2">Case 1 (n=48<sup>a</sup>), n (%)</td>
                <td colspan="2">Case 2 (n=48<sup>a</sup>), n (%)</td>
                <td colspan="2">Case 3 (n=48<sup>a</sup>), n (%)</td>
                <td colspan="2">Case 1 GPT-3.5 (n=36)<sup>a</sup>, n (%)</td>
                <td colspan="2">Diabetes (n=90), n (%)</td>
                <td colspan="2">Cough (n=90), n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="14">
                  <bold>Testing or treatment</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Less</td>
                <td colspan="2">
                  <italic>20 (42)</italic>
                  <sup>b</sup>
                </td>
                <td colspan="2">
                  <italic>35 (73)</italic>
                </td>
                <td colspan="2">0 (0)</td>
                <td colspan="2">
                  <italic>17 (47)</italic>
                </td>
                <td colspan="2">41 (46)</td>
                <td>31 (34)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">No opinion</td>
                <td colspan="2">27 (56)</td>
                <td colspan="2">13 (27)</td>
                <td colspan="2">11 (23)</td>
                <td colspan="2">17 (47)</td>
                <td colspan="2">25 (28)</td>
                <td>43 (48)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">More</td>
                <td colspan="2">1 (2)</td>
                <td colspan="2">0 (0)</td>
                <td colspan="2">
                  <italic>37 (77)</italic>
                </td>
                <td colspan="2">2 (6)</td>
                <td colspan="2">24 (27)</td>
                <td>16 (18)</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>Cost</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Lower</td>
                <td colspan="2">3 (6)</td>
                <td colspan="2">
                  <italic>47 (98)</italic>
                </td>
                <td colspan="2">1 (2)</td>
                <td colspan="2">2 (6)</td>
                <td colspan="2">25 (28)</td>
                <td>28 (31)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">No opinion</td>
                <td colspan="2">
                  <italic>43 (90)</italic>
                </td>
                <td colspan="2">1 (2)</td>
                <td colspan="2">21 (44)</td>
                <td colspan="2">
                  <italic>29 (81)</italic>
                </td>
                <td colspan="2">39 (43)</td>
                <td>55 (61)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Not an issue</td>
                <td colspan="2">2 (4)</td>
                <td colspan="2">0 (0)</td>
                <td colspan="2">
                  <italic>26 (54)</italic>
                </td>
                <td colspan="2">5 (14)</td>
                <td colspan="2">26 (29)</td>
                <td>7 (8)</td>
              </tr>
              <tr valign="top">
                <td colspan="14">
                  <bold>Impact on life</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Prioritize lifestyle</td>
                <td colspan="2">3 (6)</td>
                <td colspan="2">3 (6)</td>
                <td colspan="2">0 (0)</td>
                <td colspan="2">1 (3)</td>
                <td colspan="2">6 (7)</td>
                <td>1 (1)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">No opinion</td>
                <td colspan="2">
                  <italic>45 (94)</italic>
                </td>
                <td colspan="2">
                  <italic>43 (90)</italic>
                </td>
                <td colspan="2">19 (40)</td>
                <td colspan="2">
                  <italic>34 (94)</italic>
                </td>
                <td colspan="2">65 (72)</td>
                <td>76 (84)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="2">Prioritize illness control</td>
                <td colspan="2">0 (0)</td>
                <td colspan="2">2 (4)</td>
                <td colspan="2">
                  <italic>29 (60)</italic>
                </td>
                <td colspan="2">1 (3)</td>
                <td colspan="2">19 (21)</td>
                <td>13 (14)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>This table indicates patient preferences as planned and prompted in the case description provided to the generative pretrained transformer (GPT), and preferences as perceived by blinded human raters to be represented in the dialogues. Case 1 was planned to reflect desire for less testing or treatment. Case 2 was planned to reflect strong desire for lower cost, and hence less testing or treatment. Case 3 was planned to reflect desire for more testing or treatment, cost not an issue, and prioritization of illness control over lifestyle. See Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for details on planned case features.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>Italicized values indicate dialogues in which prompted and perceived preferences align.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Conversation Quality: Authenticity, Experience, and Feedback</title>
        <p>Conversation quality was appraised by 1 rater at the time of creation and later by all 3 investigators (final ratings).</p>
        <sec>
          <title>Conversation Creation</title>
          <p>During creation, mean dialogue ratings ranged from 4.8 to 5.4 (out of a maximum rating of 6) for authenticity and from 4.5 to 5.0 for user experience (<xref ref-type="table" rid="table2">Table 2</xref>). Feedback quality ranged from 4.3 to 4.9. Ratings were significantly higher for GPT-4.0-Turbo versus GPT-3.5-Turbo (difference: dialogue overall 0.92, 95% CI 0.64-1.19; experience overall 0.58, 95% CI 0.21-0.96; feedback average 1.33, 95% CI 0.80-1.87).</p>
        </sec>
        <sec>
          <title>Final Ratings</title>
          <p>The reproducibilities of authenticity and experience final ratings were typically “fair,” with ICCs ranging from 0.17 to 0.40 (<xref ref-type="table" rid="table1">Table 1</xref>). In contrast, reproducibilities for feedback ratings were “slight,” with ICCs ≤0.17 for all but 1 domain. We examined the variance components (Tables S2 and S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>) and found very small between-rater variances (representing ≤5% of total variance for all domains except feedback evidence based, for which it was 18%). In contrast, we found large (≥60% of total) between-replication variances, which reflect a combination of true differences in GPT performances and within-rater variability.</p>
          <p>Mean final ratings ranged from 4.6 to 5.0 for authenticity, 4.6 to 4.9 for experience, and 4.5 to 4.9 for feedback (see <xref ref-type="table" rid="table4">Table 4</xref> and Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for details, including estimates of measurement variability and subgroup analyses).</p>
          <table-wrap position="float" id="table4">
            <label>Table 4</label>
            <caption>
              <p>Final ratings of conversation quality: mean and median scores<sup>a</sup>.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="30"/>
              <col width="160"/>
              <col width="0"/>
              <col width="90"/>
              <col width="0"/>
              <col width="90"/>
              <col width="0"/>
              <col width="90"/>
              <col width="0"/>
              <col width="90"/>
              <col width="0"/>
              <col width="90"/>
              <col width="0"/>
              <col width="0"/>
              <col width="90"/>
              <col width="0"/>
              <col width="90"/>
              <col width="0"/>
              <col width="90"/>
              <col width="0"/>
              <col width="90"/>
              <thead>
                <tr valign="top">
                  <td colspan="3">
                    <break/>
                  </td>
                  <td colspan="11">Rater, mean (SD), median</td>
                  <td colspan="7">Case, mean (SD), median</td>
                </tr>
                <tr valign="top">
                  <td colspan="3">
                    <break/>
                  </td>
                  <td colspan="2">All human raters (N=180)</td>
                  <td colspan="2">GPT<sup>b</sup> rater (N=180)</td>
                  <td colspan="2">Human rater 1 (N=60)</td>
                  <td colspan="2">Human rater 2 (N=60)</td>
                  <td colspan="2">Human rater 3 (N=60)</td>
                  <td colspan="3">Case 1 (N=48)</td>
                  <td colspan="2">Case 2 (N=48)</td>
                  <td colspan="2">Case 3 (N=48)</td>
                  <td>Case 1, GPT-3.5<sup>c</sup> (N=36)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td colspan="21">
                    <bold>Dialogue authenticity</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Overall</td>
                  <td colspan="2">4.7 (0.7), 5</td>
                  <td colspan="2">5.2 (0.6), 5</td>
                  <td colspan="2">4.8 (0.8), 5</td>
                  <td colspan="2">4.8 (0.6), 5</td>
                  <td colspan="2">4.6 (0.7), 5</td>
                  <td colspan="3">4.7 (0.7), 5</td>
                  <td colspan="2">4.9 (0.8), 5</td>
                  <td colspan="2">4.8 (0.6), 5</td>
                  <td colspan="2">4.4 (0.8), 5</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Humanlike</td>
                  <td colspan="2">4.6 (0.8), 5</td>
                  <td colspan="2">5.6 (0.5), 6</td>
                  <td colspan="2">4.8 (0.9), 5</td>
                  <td colspan="2">4.6 (0.5), 5</td>
                  <td colspan="2">4.5 (0.8), 5</td>
                  <td colspan="3">4.5 (0.7), 5</td>
                  <td colspan="2">5.0 (0.7), 5</td>
                  <td colspan="2">4.8 (0.6), 5</td>
                  <td colspan="2">4.1 (0.9), 4</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Coherent</td>
                  <td colspan="2">5.0 (0.6), 5</td>
                  <td colspan="2">5.6 (0.5), 6</td>
                  <td colspan="2">5.0 (0.8), 5</td>
                  <td colspan="2">4.9 (0.5), 5</td>
                  <td colspan="2">5.0 (0.6), 5</td>
                  <td colspan="3">5.0 (0.5), 5</td>
                  <td colspan="2">5.1 (0.5), 5</td>
                  <td colspan="2">5.2 (0.4), 5</td>
                  <td colspan="2">4.4 (0.9), 5</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Personal</td>
                  <td colspan="2">5.0 (0.6), 5</td>
                  <td colspan="2">5.1 (0.6), 5</td>
                  <td colspan="2">5.2 (0.9), 5</td>
                  <td colspan="2">5.0 (0.2), 5</td>
                  <td colspan="2">4.9 (0.7), 5</td>
                  <td colspan="3">5.0 (0.5), 5</td>
                  <td colspan="2">5.2 (0.6), 5</td>
                  <td colspan="2">5.1 (0.7), 5</td>
                  <td colspan="2">4.6 (0.6), 5</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Relevant</td>
                  <td colspan="2">4.9 (0.6), 5</td>
                  <td colspan="2">5.8 (0.4), 6</td>
                  <td colspan="2">4.8 (0.9), 5</td>
                  <td colspan="2">4.9 (0.4), 5</td>
                  <td colspan="2">4.9 (0.5), 5</td>
                  <td colspan="3">4.9 (0.5), 5</td>
                  <td colspan="2">5.0 (0.7), 5</td>
                  <td colspan="2">5.0 (0.5), 5</td>
                  <td colspan="2">4.6 (0.8), 5</td>
                </tr>
                <tr valign="top">
                  <td colspan="21">
                    <bold>User experience</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Overall</td>
                  <td colspan="2">4.9 (0.7), 5</td>
                  <td colspan="2">—<sup>d</sup></td>
                  <td colspan="2">4.9 (1.0), 5</td>
                  <td colspan="2">4.9 (0.3), 5</td>
                  <td colspan="2">4.8 (0.6), 5</td>
                  <td colspan="3">4.7 (0.7), 5</td>
                  <td colspan="2">5.1 (0.7), 5</td>
                  <td colspan="2">4.9 (0.7), 5</td>
                  <td colspan="2">4.8 (0.6), 5</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Realness</td>
                  <td colspan="2">4.6 (0.8), 5</td>
                  <td colspan="2">—</td>
                  <td colspan="2">4.7 (1.1), 5</td>
                  <td colspan="2">4.6 (0.6), 5</td>
                  <td colspan="2">4.5 (0.8), 5</td>
                  <td colspan="3">4.6 (0.7), 5</td>
                  <td colspan="2">4.9 (0.8), 5</td>
                  <td colspan="2">4.8 (0.8), 5</td>
                  <td colspan="2">4.1 (0.9), 4</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Cognitive authenticity</td>
                  <td colspan="2">4.8 (0.7), 5</td>
                  <td colspan="2">—</td>
                  <td colspan="2">5.0 (0.9), 5</td>
                  <td colspan="2">4.9 (0.3), 5</td>
                  <td colspan="2">4.6 (0.7), 5</td>
                  <td colspan="3">4.8 (0.7), 5</td>
                  <td colspan="2">5.0 (0.7), 5</td>
                  <td colspan="2">4.9 (0.8), 5</td>
                  <td colspan="2">4.7 (0.5), 5</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Variability</td>
                  <td colspan="2">4.8 (0.9), 5</td>
                  <td colspan="2">—</td>
                  <td colspan="2">4.6 (1.2), 5</td>
                  <td colspan="2">4.9 (0.3), 5</td>
                  <td colspan="2">4.7 (0.8), 5</td>
                  <td colspan="3">4.6 (0.9), 5</td>
                  <td colspan="2">5.0 (0.8), 5</td>
                  <td colspan="2">4.8 (0.9), 5</td>
                  <td colspan="2">4.5 (0.9), 5</td>
                </tr>
                <tr valign="top">
                  <td colspan="21">
                    <bold>Feedback</bold>
                  </td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Average</td>
                  <td colspan="2">4.7 (0.6), 4.9</td>
                  <td colspan="2">4.6 (0.2), 4.8</td>
                  <td colspan="2">4.8 (0.8), 4.9</td>
                  <td colspan="2">4.8 (0.4), 5</td>
                  <td colspan="2">4.5 (0.6), 4.8</td>
                  <td colspan="3">4.8 (0.5), 5</td>
                  <td colspan="2">4.9 (0.6), 5</td>
                  <td colspan="2">4.8 (0.5), 5</td>
                  <td colspan="2">4.1 (0.7), 4.1</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Evidence based</td>
                  <td colspan="2">4.5 (0.9), 5</td>
                  <td colspan="2">4.9 (0.3), 5</td>
                  <td colspan="2">4.7 (1.0), 5</td>
                  <td colspan="2">4.8 (0.4), 5</td>
                  <td colspan="2">4.1 (0.9), 4</td>
                  <td colspan="3">4.7 (0.7), 5</td>
                  <td colspan="2">4.7 (0.8), 5</td>
                  <td colspan="2">4.6 (0.8), 5</td>
                  <td colspan="2">3.9 (1.1), 4</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Actionable</td>
                  <td colspan="2">4.9 (0.6), 5</td>
                  <td colspan="2">4.5 (0.5), 5</td>
                  <td colspan="2">5.1 (0.8), 5</td>
                  <td colspan="2">4.9 (0.3), 5</td>
                  <td colspan="2">4.8 (0.6), 5</td>
                  <td colspan="3">5.0 (0.5), 5</td>
                  <td colspan="2">5.1 (0.5), 5</td>
                  <td colspan="2">5.1 (0.5), 5</td>
                  <td colspan="2">4.4 (0.7), 5</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Connected</td>
                  <td colspan="2">4.9 (0.7), 5</td>
                  <td colspan="2">4.6 (0.5), 5</td>
                  <td colspan="2">4.9 (0.9), 5</td>
                  <td colspan="2">4.9 (0.4), 5</td>
                  <td colspan="2">4.8 (0.6), 5</td>
                  <td colspan="3">5.1 (0.5), 5</td>
                  <td colspan="2">5.0 (0.7), 5</td>
                  <td colspan="2">5.1 (0.4), 5</td>
                  <td colspan="2">4.2 (0.8), 4</td>
                </tr>
                <tr valign="top">
                  <td>
                    <break/>
                  </td>
                  <td>Balanced</td>
                  <td colspan="2">4.5 (0.9), 5</td>
                  <td colspan="2">4.5 (0.6), 5</td>
                  <td colspan="2">4.5 (1.1), 5</td>
                  <td colspan="2">4.6 (0.7), 5</td>
                  <td colspan="2">4.4 (0.9), 5</td>
                  <td colspan="3">4.6 (0.9), 5</td>
                  <td colspan="2">4.7 (0.8), 5</td>
                  <td colspan="2">4.6 (0.9), 5</td>
                  <td colspan="2">4.0 (0.9), 4</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table4fn1">
                <p><sup>a</sup>All conversations (ie, dialogue and feedback) were rated for “final ratings” by 3 blinded human raters (ie, board-certified internal medicine physicians) and by GPT. Results are reported as unweighted mean (SD) and median across all conversations. A “conversation” refers to the VP-clinician dialogue plus feedback. Response options for all items ranged from 1=strongly disagree to 6=strongly agree. Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> reports additional rating subgroups (ie, by topic and clinician persona).</p>
              </fn>
              <fn id="table4fn2">
                <p><sup>b</sup>GPT: generative pretrained transformer.</p>
              </fn>
              <fn id="table4fn3">
                <p><sup>c</sup>“GPT-3.5” conversations used GPT-3.5-Turbo as the virtual patient (N=36 because these did not include 12 self-chat conversations). All other conversations used GPT-4.0-Turbo.</p>
              </fn>
              <fn id="table4fn4">
                <p><sup>d</sup>GPT did not rate user experience.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
          <p>We report final ratings subgroup comparisons in <xref ref-type="table" rid="table5">Table 5</xref>. Differences between topics were small. All ratings were higher for GPT-4.0-Turbo versus GPT-3.5-Turbo (ie, differences ranging from 0.17 to 0.71), although differences did not always reach statistical significance (as indicated by the 95% CIs). Conversations involving human clinicians had higher experience ratings than those with GPT as clinician (ie, differences ≥0.57) but similar authenticity (ie, differences ≤0.31) and—as would be expected—similar feedback ratings (ie, difference −0.05). Among human clinicians, the resident persona had higher ratings than the poor medical student, and these differences (≥0.48) were statistically significant for authenticity and experience. No instances of potential bias were identified during creation or final rating.</p>
          <table-wrap position="float" id="table5">
            <label>Table 5</label>
            <caption>
              <p>Final ratings of conversation quality: subgroup comparisons<sup>a</sup>.</p>
            </caption>
            <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
              <col width="160"/>
              <col width="160"/>
              <col width="170"/>
              <col width="160"/>
              <col width="200"/>
              <col width="150"/>
              <thead>
                <tr valign="top">
                  <td>Outcome</td>
                  <td>Topic: Diabetes vs cough (n=180), mean difference (95% CI)</td>
                  <td>GPT<sup>b</sup> model: 4.0 vs 3.5 (case 1; n=72), mean difference (95% CI)</td>
                  <td>Clinician: human vs GPT (n=180), mean difference (95% CI)</td>
                  <td>Clinician: resident vs medical student persona (n=144), mean difference (95% CI)<sup>c</sup></td>
                  <td>Rater: human vs GPT (n=360), mean difference (95% CI)</td>
                </tr>
              </thead>
              <tbody>
                <tr valign="top">
                  <td>Dialogue authenticity: overall</td>
                  <td>0.14 (−0.25 to 0.54)</td>
                  <td>0.42 (−0.19 to 1.02)</td>
                  <td>0.31 (−0.25 to 0.88)</td>
                  <td>0.69 (0.26 to 1.12)</td>
                  <td>−0.52 (−0.85 to −0.19)</td>
                </tr>
                <tr valign="top">
                  <td>Dialogue authenticity: humanlike<sup>d</sup></td>
                  <td>0.09 (−0.32 to 0.50)</td>
                  <td>0.50 (−0.16 to 1.16)</td>
                  <td>0.12 (−0.46 to 0.70)</td>
                  <td>0.71 (0.22 to 1.20)</td>
                  <td>−0.98 (−1.24 to −0.71)</td>
                </tr>
                <tr valign="top">
                  <td>User experience: overall</td>
                  <td>0.03 (−0.37 to 0.44)</td>
                  <td>0.17 (−0.39 to 0.72)</td>
                  <td>0.57 (0.04 to 1.11)</td>
                  <td>0.48 (0.07 to 0.88)</td>
                  <td>—<sup>e</sup></td>
                </tr>
                <tr valign="top">
                  <td>User experience: realness<sup>d</sup></td>
                  <td>0.18 (−0.28 to 0.63)</td>
                  <td>0.58 (−0.08 to 1.25)</td>
                  <td>0.69 (0.06 to 1.33)</td>
                  <td>0.75 (0.24 to 1.26)</td>
                  <td>—<sup>e</sup></td>
                </tr>
                <tr valign="top">
                  <td>Feedback: average</td>
                  <td>0.03 (−0.33 to 0.38)</td>
                  <td>0.71 (0.13 to 1.28)</td>
                  <td>−0.05 (−0.51 to 0.41)</td>
                  <td>0.17 (−0.24 to 0.59)</td>
                  <td>0.10 (−0.37 to 0.58)</td>
                </tr>
              </tbody>
            </table>
            <table-wrap-foot>
              <fn id="table5fn1">
                <p><sup>a</sup>All conversations (dialogue and feedback) were rated for “final ratings” by 3 blinded human raters (ie, board-certified internal medicine physicians) and by GPT. Results reported here reflect adjusted mean differences between groups accounting for repeated measures on conversations and Tukey-adjusted 95% CI. A “conversation” refers to the VP-clinician dialogue plus feedback. Conversations included in each analysis were matched according to design features; nonmatching conversations were excluded. Response options for all items ranged from 1=strongly disagree to 6=strongly agree.</p>
              </fn>
              <fn id="table5fn2">
                <p><sup>b</sup>GPT: generative pretrained transformer.</p>
              </fn>
              <fn id="table5fn3">
                <p><sup>c</sup>This contrast was selected for reporting post hoc, after the omnibus test across all human clinician personas revealed statistically significant differences (<italic>P</italic>≤.03) for all outcomes except feedback. None of the other pairwise contrasts among human-played personas reached statistical significance.</p>
              </fn>
              <fn id="table5fn4">
                <p><sup>d</sup>These outcomes were selected a priori for reporting because they closely aligned with the overarching study aim.</p>
              </fn>
              <fn id="table5fn5">
                <p><sup>e</sup>GPT did not rate user experience.</p>
              </fn>
            </table-wrap-foot>
          </table-wrap>
        </sec>
      </sec>
      <sec>
        <title>Features That Detracted From or Enhanced Authenticity</title>
        <p>We identified features that detracted from or enhanced conversation authenticity (<xref ref-type="table" rid="table6">Table 6</xref>). Across 180 dialogues, the most frequent detractors were that GPT was verbose or used atypical vocabulary (93/180, 51.7%), was overly agreeable (56/180, 31.1%), repeated the question as part of the response (47/180, 26.1%), was too easily convinced by clinician suggestions (35/180, 19.4%), or was not offended or confused by poor clinician performance (eg, jargon and poorly worded questions; 32/180, 17.8%). Enhancers included expressing an explicit preference or choice (ie, especially preferences contrary to the clinician’s initial suggestion, 106/180, 58.9%), expressing appropriate emotion (40/180, 22.2%), and notably natural speech (38/180, 21.1%).</p>
        <table-wrap position="float" id="table6">
          <label>Table 6</label>
          <caption>
            <p>Features that detracted from or enhanced virtual patient conversations.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="30"/>
            <col width="480"/>
            <col width="0"/>
            <col width="140"/>
            <col width="0"/>
            <col width="170"/>
            <col width="0"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td colspan="4">Feature<sup>a</sup></td>
                <td colspan="2">All (n=180), n (%)</td>
                <td colspan="2">Diabetes (n=90), n (%)</td>
                <td>Cough (n=90), n (%)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="9">
                  <bold>Dialogue</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="8">
                  <bold>Detracted</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Responses reflect atypical word choice, verbose</td>
                <td colspan="2">93 (51.7)</td>
                <td colspan="2">50 (55.6)</td>
                <td colspan="2">43 (47.8)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Overly agreeable</td>
                <td colspan="2">56 (31.1)</td>
                <td colspan="2">35 (38.9)</td>
                <td colspan="2">21 (23.3)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Repeated question as part of response</td>
                <td colspan="2">47 (26.1)</td>
                <td colspan="2">16 (17.8)</td>
                <td colspan="2">31 (34.4)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Easily convinced or manipulated by clinician</td>
                <td colspan="2">35 (19.4)</td>
                <td colspan="2">23 (25.6)</td>
                <td colspan="2">12 (13.3)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Not offended or confused by poor clinician performance (including jargon)</td>
                <td colspan="2">32 (17.8)</td>
                <td colspan="2">20 (22.2)</td>
                <td colspan="2">12 (13.3)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Clinician dialogue was unrealistic</td>
                <td colspan="2">29 (16.1)</td>
                <td colspan="2">14 (15.6)</td>
                <td colspan="2">15 (16.7)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Volunteered too much information (without being asked)</td>
                <td colspan="2">28 (15.6)</td>
                <td colspan="2">15 (16.7)</td>
                <td colspan="2">13 (14.4)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Test ordering and reporting was unrealistic</td>
                <td colspan="2">23 (12.8)</td>
                <td colspan="2">1 (1.1)</td>
                <td colspan="2">22 (24.4)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Responses did not make sense</td>
                <td colspan="2">12 (6.7)</td>
                <td colspan="2">2 (2.2)</td>
                <td colspan="2">10 (11.1)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Offered excessive teaching support</td>
                <td colspan="2">10 (5.6)</td>
                <td colspan="2">4 (4.4)</td>
                <td colspan="2">6 (6.7)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Switched to playing role of doctor</td>
                <td colspan="2">6 (3.3)</td>
                <td colspan="2">0 (0)</td>
                <td colspan="2">6 (6.7)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="8">
                  <bold>Enhanced</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Expressed preference, challenged recommendations, made clear choice</td>
                <td colspan="2">106 (58.9)</td>
                <td colspan="2">57 (63.3)</td>
                <td colspan="2">49 (54.4)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Expressed appropriate emotion</td>
                <td colspan="2">40 (22.2)</td>
                <td colspan="2">23 (25.6)</td>
                <td colspan="2">17 (18.9)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Very natural flow; authentic word choice; fluent</td>
                <td colspan="2">38 (21.1)</td>
                <td colspan="2">24 (26.7)</td>
                <td colspan="2">14 (15.6)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Challenged clinician when vague or nonsensical</td>
                <td colspan="2">31 (17.2)</td>
                <td colspan="2">6 (6.7)</td>
                <td colspan="2">25 (27.8)</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Feedback</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="8">
                  <bold>Detracted</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Too positive or insufficient critique (relative to actual performance)</td>
                <td colspan="2">42 (23.3)</td>
                <td colspan="2">17 (18.9)</td>
                <td colspan="2">25 (27.8)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Omission: behavioral weakness or strength not mentioned</td>
                <td colspan="2">41 (22.8)</td>
                <td colspan="2">18 (20)</td>
                <td colspan="2">23 (25.6)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Inaccurate: “Omitted” behaviors really <italic>were</italic> done</td>
                <td colspan="2">39 (21.7)</td>
                <td colspan="2">19 (21.1)</td>
                <td colspan="2">20 (22.2)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Inaccurate: “Needed” behaviors really not needed</td>
                <td colspan="2">32 (17.8)</td>
                <td colspan="2">19 (21.1)</td>
                <td colspan="2">13 (14.4)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Too long, unrealistically detailed</td>
                <td colspan="2">24 (13.3)</td>
                <td colspan="2">9 (10)</td>
                <td colspan="2">15 (16.7)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Too negative or insufficient praise (relative to actual performance)</td>
                <td colspan="2">23 (12.8)</td>
                <td colspan="2">13 (14.4)</td>
                <td colspan="2">10 (11.1)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Inaccurate: “Observed” behaviors really not done</td>
                <td colspan="2">22 (12.2)</td>
                <td colspan="2">15 (16.7)</td>
                <td colspan="2">7 (7.8)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Too vague, brief</td>
                <td colspan="2">19 (10.6)</td>
                <td colspan="2">11 (12.2)</td>
                <td colspan="2">8 (8.9)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Omission: inappropriate treatment plan not mentioned</td>
                <td colspan="2">17 (9.4)</td>
                <td colspan="2">9 (10)</td>
                <td colspan="2">8 (8.9)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Inaccurate: a suggested clinical test or treatment not really needed</td>
                <td colspan="2">15 (8.3)</td>
                <td colspan="2">10 (11.1)</td>
                <td colspan="2">5 (5.6)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td colspan="8">
                  <bold>Enhanced</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Notably specific, actionable, constructive, accurate</td>
                <td colspan="2">75 (41.7)</td>
                <td colspan="2">41 (45.6)</td>
                <td colspan="2">34 (37.8)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Suggested notably useful clinical action</td>
                <td colspan="2">63 (35)</td>
                <td colspan="2">31 (34.4)</td>
                <td colspan="2">32 (35.6)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Identified notably or subtly good or bad behavior</td>
                <td colspan="2">46 (25.6)</td>
                <td colspan="2">22 (24.4)</td>
                <td colspan="2">24 (26.7)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Notably well justified or prioritized</td>
                <td colspan="2">31 (17.2)</td>
                <td colspan="2">14 (15.6)</td>
                <td colspan="2">17 (18.9)</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>Notably balanced; limited praise for poor performance</td>
                <td colspan="2">12 (6.7)</td>
                <td colspan="2">3 (3.3)</td>
                <td colspan="2">9 (10)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table6fn1">
              <p><sup>a</sup>We inductively and iteratively developed a list of detracting and enhancing features throughout the process of conversation creation and final ratings, and each rater then independently marked the presence of each feature as it was noted.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>For feedback, detractors included excessively positive feedback relative to actual performance (42/180, 23.3%), failure to mention an important weakness or strength (41/180, 22.8%), inaccuracies due to claimed omissions that were actually done (39/180, 21.7%), or suggested behaviors that were not really needed (32/180, 17.8%). Enhancers included being notably specific or actionable (75/180, 41.7%), suggesting a useful clinical action (63/180, 35%), and recognizing a subtle aspect of clinician performance (46/180, 25.6%).</p>
      </sec>
      <sec>
        <title>Human Versus LLM Quality Ratings</title>
        <p>We used GPT-4.0-Turbo to rate each conversation 3 times, requiring 121,860 tokens (US $1.22) per run. GPT took 228 to 506 seconds to rate authenticity and 221 to 234 seconds to rate feedback for all conversations. In contrast with human ratings, between-replication variance in ratings approached 0, such that all nonfeature variance resulted from run-to-run inconsistencies in GPT ratings (Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). The resulting ICCs (<xref ref-type="table" rid="table1">Table 1</xref>) were on par with those of human raters.</p>
        <p>In paired (ie, feature-matched) analyses, authenticity ratings (<xref ref-type="table" rid="table4">Table 4</xref>) were significantly lower (<xref ref-type="table" rid="table5">Table 5</xref>) for human-generated versus GPT-generated ratings (ie, −0.98 points for humanlike; −0.52 points overall), whereas feedback ratings were similar for both (ie, only 0.10 points higher).</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This study explored 4 applications of LLMs for clinical education: a low-cost, scalable LLM-powered interactive VP; LLM-generated feedback on clinician performance; LLM role-playing the clinician; and LLM-generated ratings of dialogue and feedback. This is the first study to empirically evaluate LLM-powered VPs, and the results are overall favorable. According to blinded human raters, VPs approached a “very good approximation of a real conversation” with “easily overlooked flaws,” and LLM-generated personalized feedback was nearly “on par with [feedback] from a trained human clinician-supervisor” (quoting operational criteria for rating=5, see Box S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Moreover, the VP demonstrably represented distinct patient preferences, including often expressing opinions that opposed clinician suggestions. LLM-as-clinician dialogues had authenticity ratings similar to human-as-clinician dialogues. LLM-generated ratings of feedback quality were similar to human ratings, whereas ratings of authenticity were much higher, which suggests inaccuracy. We also developed and validated instruments for rating dialogue authenticity, VP user experience, and feedback quality.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>The most salient limitation is suboptimal reproducibility of human ratings. Importantly, the high between-replication variances suggest that inconsistencies could come from real differences in GPT performance in simulating the “same” case. Indeed, conversation creators noted significant differences in GPT responses on the second replication. High variances could also indicate within-rater idiosyncrasies and inconsistencies, and refined operational criteria and improved rater training could mitigate this. Low reproducibility could further arise from restriction of range: we asked GPT to provide excellent feedback, and for the most part it delivered. Soliciting a wider range of performance (eg, including intentionally substandard feedback) might reveal higher agreement. We noted difficulty in rating long conversations, especially when problems manifest in only a small part of an otherwise satisfactory conversation. It might help to rate shorter texts, which could be generated by splitting the text into chunks based on word count or using AI to extract salient subtexts. User experience was difficult to rate from a written transcript; we surmise that rating user experience as it dynamically unfolds in written text, or viewing a recorded performance, would be more meaningful. Importantly, our analyses adjusted for within-rater correlation, which helps mitigate rater inconsistencies for the purposes of this study.</p>
        <p>GPT-generated ratings also had low reproducibility, but variance arose from run-to-run inconsistencies rather than replications. The data suggest that within a given analysis run, GPT assigns a similar rating level to all conversations; and on different runs it assigns different rating levels (ie, a different baseline). Providing training examples would likely improve consistency (ie, standardization).</p>
        <p>There are other limitations. We adjusted the operational criteria for ratings between conversation creation and final ratings, thus precluding a meaningful evaluation of intrarater test-retest reliability. These VPs used only written text; however, authenticity was high even with this limitation. Moreover, we note that much clinical work now occurs using text communication. Recently released LLMs now support live bidirectional audio and video. We implemented just 2 topics from outpatient internal medicine and a limited spectrum of patient preferences; however, our approach easily extends to other topics and contextualizing features. Finally, for this intrinsic evaluation study, the clinician role was played by study investigators rather than real learners; real-world performance will be investigated in future extrinsic evaluations.</p>
      </sec>
      <sec>
        <title>Implications</title>
        <p>We demonstrated proof of concept for scalable, globally accessible, and low-cost LLM-powered VPs. The unscripted, responsive dialogues contrast sharply with most existing VPs, for which authentic and flexible dialogue is notoriously difficult to replicate and often not attempted. Such authenticity will facilitate training, assessment, and research on shared decision-making [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref16">16</xref>] and other management reasoning processes [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. Although patient preferences were not always perceivable, this parallels real life. A patient’s preferences will not surface in every patient-clinician encounter and often require elicitation by a skilled clinician [<xref ref-type="bibr" rid="ref67">67</xref>]. Accordingly, the LLM’s ability to perceptibly represent preferences is commendable. Using this LLM-powered approach, thousands of preference-sensitive VPs can be created with much higher efficiency, and potentially higher authenticity, than current labor-intensive methods. A VP is “created” as a 1-page document, and permutations are incorporated by changing a few sentences. Such permutations (ie, preferences, comorbidities, social determinants of health, and system constraints) will prove invaluable in training and assessing contextualized care [<xref ref-type="bibr" rid="ref17">17</xref>-<xref ref-type="bibr" rid="ref19">19</xref>].</p>
        <p>Our findings support the use of LLMs to deliver specific, actionable feedback to clinicians. This fills an important, long-recognized gap in clinical training [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref27">27</xref>]. Although LLM-generated feedback was not perfect, it was very good. If future research can improve feedback quality—perhaps using defined rubrics—it could support education across the continuum of clinician training and extending beyond VPs, including audio-recorded encounters involving simulated or real human patients and encompassing practicing physicians (eg, automated feedback on actual patient-clinician conversations for continuous professional development).</p>
        <p>Subgroup comparisons provide a more nuanced understanding. GPT-4.0-Turbo outperformed GPT-3.5-Turbo in both dialogues and feedback, albeit at substantially greater cost. By contrast, the absence of differences in all other comparisons of feedback is expected and thus reassuring (ie, we would not expect feedback quality to differ by topic or persona). LLM-as-clinician dialogues generated a less realistic user experience even though dialogue authenticity was similar. Dialogues for the poor medical student persona had low ratings; we attribute this to failure of the LLM to respond appropriately to poor performance (eg, by volunteering information or not expressing confusion) and raters’ perception that the student’s performance was unnatural.</p>
        <p>We present evidence supporting the validity of scores from 3 instruments, rating dialogue authenticity, user experience, and feedback quality. Items were well grounded (ie, <italic>content</italic> evidence), and we confirmed expected <italic>relations with other variables</italic> (higher ratings for advanced LLM models and human clinician personas). Reproducibility (ie, <italic>internal structure</italic>) was suboptimal; however, our data suggest that inconsistencies arise, at least in part, from variation in LLM performance rather than rater idiosyncrasies. The data on features that detracted from or enhanced conversation quality provided evidence regarding investigators’ <italic>response processes</italic>, which largely align with the constructs embodied in the instrument items. We have suggested several steps that could improve reproducibility in future work.</p>
        <p>Zero-shot LLM-generated ratings were suboptimal. LLM feedback ratings were similar to pair-matched human-generated ratings, but reproducibility was low. Dialogue ratings were higher than humans’ and presumably inaccurate, perhaps because GPT was rating itself. We speculate that a different LLM might be more objective. Providing examples (eg, few-shot learning) may also be needed. We had reservations that GPT could provide meaningful ratings of user experience (ie, an innately human perception) and thus did not attempt this. Future research could explore this.</p>
        <p>Although LLMs are known to occasionally render biased responses, we did not detect any instances of bias in these conversations. We did encounter problems arising from rules built into GPT to <italic>prevent</italic> such responses: for example, when we tried to incorporate certain social determinants of health (such as race or income status), GPT would occasionally reject these as inappropriate—even though they were well-intentioned. We also built rules into our LLM prompt to identify and correct potentially biased statements from the clinician-user. We tested these during the prompt engineering phase, but not during formal conversation creation. We recommend ongoing attention to bias in future simulations.</p>
        <p>Our findings suggest additional avenues for research. All these innovations—the LLM-powered VPs, LLM-generated feedback, LLM-clinician, and LLM-generated ratings—would benefit from further-refined prompt engineering and iterative evaluation. We also wonder if performance might be improved using fine-tuned LLMs with health care conversations as training data. As we found, LLMs respond differently every time; this is a strength (eg, spontaneous and natural dialogue), but also a liability (eg, inconsistent conditions for assessment or training). What are the consequences of such variability, and how can variability be mitigated when needed (such as for standardized assessment)? VPs could help address or inadvertently propagate bias and stereotypes; this warrants ongoing attention.</p>
        <p>Finally, we note diverse potential applications of LLM-powered VPs, including clinical reasoning in other contexts (eg, inpatient and procedural settings), training nonclinicians (eg, nurses, therapists, pharmacists, and patients), education beyond clinical reasoning (ie, basic knowledge [through case-based learning], communication, teamwork, interprofessional education, tasks such as cognitive behavioral therapy or motivational interviewing, and socialization into the clinical role), and generating transcripts for research (eg, for studies comparing different feedback approaches). LLM-powered VPs could also help test clinical interventions (eg, novel workflows, informatics tools [software as a medical device], and AI innovations) or rehearse specific high-stakes scenarios (“digital twin”).</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Generative pretrained transformer (GPT) prompts, a representative conversation, details on instrument development and operational criteria for rating dialogue and feedback quality, planned case permutations, variance components for ratings, and ratings for specific study subgroups.</p>
        <media xlink:href="jmir_v27i1e68486_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 445 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">GPT</term>
          <def>
            <p>generative pretrained transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">ICC</term>
          <def>
            <p>intraclass correlation coefficient</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">VP</term>
          <def>
            <p>virtual patient</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>The authors thank Martin G Tolsgaard, PhD, DMSc (Copenhagen University Hospital and Copenhagen Academy for Medical Education and Simulation); Grace C Huang, MD (Harvard Medical School and Beth Israel Deaconess Medical Center); and David M Howcroft, PhD (Edinburgh Napier University) for their review and suggestions on the rating instruments.</p>
      <p>This study had no external funding. This work was funded in part by Mayo Clinic Department of Medicine, Division of General Internal Medicine, Rochester, MN. This organization (the primary investigator’s institution) had no role in planning, executing, or reporting this study. Generative artificial intelligence (large language models) played an integral role in the execution of this research. These tools played no role in the writing of the manuscript itself. We used DALL-E 3 to create the thumbnail image.</p>
    </ack>
    <notes>
      <title>Data Availability</title>
      <p>The case descriptions used in this study are provided in the online supplemental materials; these can be used with ChatGPT or the OpenAI GPT application programming interface. The Python code was published previously [<xref ref-type="bibr" rid="ref38">38</xref>]. The dataset of quality ratings is available from the corresponding author upon request within 12 months of publication.</p>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>DAC is responsible for all aspects of the study, including conceptualization, data curation, formal analysis, methodology, funding acquisition, project administration, and all phases of manuscript writing. VSP contributed to the formal analysis, methodology, and review and editing of the manuscript. All other authors contributed to the conceptualization, data curation, methodology, and review and editing of the manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Newman-Toker</surname>
              <given-names>DE</given-names>
            </name>
            <name name-style="western">
              <surname>Pronovost</surname>
              <given-names>PJ</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic errors--the next frontier for patient safety</article-title>
          <source>JAMA</source>
          <year>2009</year>
          <month>03</month>
          <day>11</day>
          <volume>301</volume>
          <issue>10</issue>
          <fpage>1060</fpage>
          <lpage>2</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2009.249</pub-id>
          <pub-id pub-id-type="medline">19278949</pub-id>
          <pub-id pub-id-type="pii">301/10/1060</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Norman</surname>
              <given-names>GR</given-names>
            </name>
            <name name-style="western">
              <surname>Monteiro</surname>
              <given-names>SD</given-names>
            </name>
            <name name-style="western">
              <surname>Sherbino</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ilgen</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Schmidt</surname>
              <given-names>HG</given-names>
            </name>
            <name name-style="western">
              <surname>Mamede</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>The causes of errors in clinical reasoning: cognitive biases, knowledge deficits, and dual process thinking</article-title>
          <source>Acad Med</source>
          <year>2017</year>
          <month>01</month>
          <volume>92</volume>
          <issue>1</issue>
          <fpage>23</fpage>
          <lpage>30</lpage>
          <pub-id pub-id-type="doi">10.1097/ACM.0000000000001421</pub-id>
          <pub-id pub-id-type="medline">27782919</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Owens</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Qaseem</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Shekelle</surname>
              <given-names>P</given-names>
            </name>
            <collab>Clinical Guidelines Committee of the American College of Physicians</collab>
          </person-group>
          <article-title>High-value, cost-conscious health care: concepts for clinicians to evaluate the benefits, harms, and costs of medical interventions</article-title>
          <source>Ann Intern Med</source>
          <year>2011</year>
          <month>03</month>
          <day>01</day>
          <volume>154</volume>
          <issue>3</issue>
          <fpage>174</fpage>
          <lpage>80</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.acpjournals.org/doi/abs/10.7326/0003-4819-154-3-201102010-00007?url_ver=Z39.88-2003&amp;rfr_id=ori:rid:crossref.org&amp;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.7326/0003-4819-154-3-201102010-00007</pub-id>
          <pub-id pub-id-type="medline">21282697</pub-id>
          <pub-id pub-id-type="pii">154/3/174</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weinberger</surname>
              <given-names>SE</given-names>
            </name>
          </person-group>
          <article-title>Providing high-value, cost-conscious care: a critical seventh general competency for physicians</article-title>
          <source>Ann Intern Med</source>
          <year>2011</year>
          <month>09</month>
          <day>20</day>
          <volume>155</volume>
          <issue>6</issue>
          <fpage>386</fpage>
          <pub-id pub-id-type="doi">10.7326/0003-4819-155-6-201109200-00007</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eva</surname>
              <given-names>KW</given-names>
            </name>
          </person-group>
          <article-title>What every teacher needs to know about clinical reasoning</article-title>
          <source>Med Educ</source>
          <year>2005</year>
          <month>01</month>
          <volume>39</volume>
          <issue>1</issue>
          <fpage>98</fpage>
          <lpage>106</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1365-2929.2004.01972.x</pub-id>
          <pub-id pub-id-type="medline">15612906</pub-id>
          <pub-id pub-id-type="pii">MED1972</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Norman</surname>
              <given-names>GR</given-names>
            </name>
            <name name-style="western">
              <surname>Eva</surname>
              <given-names>KW</given-names>
            </name>
          </person-group>
          <article-title>Diagnostic error and clinical reasoning</article-title>
          <source>Med Educ</source>
          <year>2010</year>
          <month>01</month>
          <volume>44</volume>
          <issue>1</issue>
          <fpage>94</fpage>
          <lpage>100</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1365-2923.2009.03507.x</pub-id>
          <pub-id pub-id-type="medline">20078760</pub-id>
          <pub-id pub-id-type="pii">MED3507</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Triola</surname>
              <given-names>MM</given-names>
            </name>
          </person-group>
          <article-title>Virtual patients: a critical literature review and proposed next steps</article-title>
          <source>Med Educ</source>
          <year>2009</year>
          <month>04</month>
          <volume>43</volume>
          <issue>4</issue>
          <fpage>303</fpage>
          <lpage>11</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1365-2923.2008.03286.x</pub-id>
          <pub-id pub-id-type="medline">19335571</pub-id>
          <pub-id pub-id-type="pii">MED3286</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Erwin</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Triola</surname>
              <given-names>MM</given-names>
            </name>
          </person-group>
          <article-title>Computerized virtual patients in health professions education: a systematic review and meta-analysis</article-title>
          <source>Acad Med</source>
          <year>2010</year>
          <volume>85</volume>
          <issue>10</issue>
          <fpage>1589</fpage>
          <lpage>602</lpage>
          <pub-id pub-id-type="doi">10.1097/acm.0b013e3181edfe13</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <collab>International Medical Device Regulatory Forum</collab>
          </person-group>
          <article-title>Software as a Medical Device (SAMD): clinical evaluation - guidance for industry and Food and Drug Administration staff</article-title>
          <source>U.S. Department of Health and Human Services Food and Drug Administration</source>
          <year>2017</year>
          <access-date>2024-04-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.fda.gov/regulatory-information/search-fda-guidance-documents/software-medical-device-samd-clinical-evaluation">https://www.fda.gov/regulatory-information/search-fda-guidance-documents/software-medical-device-samd-clinical-evaluation</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sutton</surname>
              <given-names>RT</given-names>
            </name>
            <name name-style="western">
              <surname>Pincock</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Baumgart</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Sadowski</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Fedorak</surname>
              <given-names>RN</given-names>
            </name>
            <name name-style="western">
              <surname>Kroeker</surname>
              <given-names>KI</given-names>
            </name>
          </person-group>
          <article-title>An overview of clinical decision support systems: benefits, risks, and strategies for success</article-title>
          <source>NPJ Digit Med</source>
          <year>2020</year>
          <volume>3</volume>
          <fpage>17</fpage>
          <pub-id pub-id-type="doi">10.1038/s41746-020-0221-y</pub-id>
          <pub-id pub-id-type="medline">32047862</pub-id>
          <pub-id pub-id-type="pii">221</pub-id>
          <pub-id pub-id-type="pmcid">PMC7005290</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Sherbino</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Durning</surname>
              <given-names>SJ</given-names>
            </name>
          </person-group>
          <article-title>Management reasoning: beyond the diagnosis</article-title>
          <source>JAMA</source>
          <year>2018</year>
          <month>06</month>
          <day>12</day>
          <volume>319</volume>
          <issue>22</issue>
          <fpage>2267</fpage>
          <lpage>8</lpage>
          <pub-id pub-id-type="doi">10.1001/jama.2018.4385</pub-id>
          <pub-id pub-id-type="medline">29800012</pub-id>
          <pub-id pub-id-type="pii">2681495</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Durning</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Sherbino</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Gruppen</surname>
              <given-names>LD</given-names>
            </name>
          </person-group>
          <article-title>Management reasoning: implications for health professions educators and a research agenda</article-title>
          <source>Acad Med</source>
          <year>2019</year>
          <month>09</month>
          <volume>94</volume>
          <issue>9</issue>
          <fpage>1310</fpage>
          <lpage>6</lpage>
          <pub-id pub-id-type="doi">10.1097/ACM.0000000000002768</pub-id>
          <pub-id pub-id-type="medline">31460922</pub-id>
          <pub-id pub-id-type="pii">00001888-201909000-00019</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Hargraves</surname>
              <given-names>IG</given-names>
            </name>
            <name name-style="western">
              <surname>Stephenson</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>Durning</surname>
              <given-names>SJ</given-names>
            </name>
          </person-group>
          <article-title>Management reasoning and patient-clinician interactions: insights from shared decision-making and simulated outpatient encounters</article-title>
          <source>Med Teach</source>
          <year>2023</year>
          <month>09</month>
          <day>10</day>
          <volume>45</volume>
          <issue>9</issue>
          <fpage>1025</fpage>
          <lpage>37</lpage>
          <pub-id pub-id-type="doi">10.1080/0142159X.2023.2170776</pub-id>
          <pub-id pub-id-type="medline">36763491</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Elwyn</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Durand</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Song</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Aarts</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Barr</surname>
              <given-names>PJ</given-names>
            </name>
            <name name-style="western">
              <surname>Berger</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Cochran</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Frosch</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Galasiński</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Gulbrandsen</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Han</surname>
              <given-names>PK</given-names>
            </name>
            <name name-style="western">
              <surname>Härter</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kinnersley</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lloyd</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Mishra</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Perestelo-Perez</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Scholl</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Tomori</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Trevena</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Witteman</surname>
              <given-names>HO</given-names>
            </name>
            <name name-style="western">
              <surname>Van der Weijden</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>A three-talk model for shared decision making: multistage consultation process</article-title>
          <source>BMJ</source>
          <year>2017</year>
          <month>11</month>
          <day>06</day>
          <volume>359</volume>
          <fpage>j4891</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.bmj.com/lookup/pmidlookup?view=long&amp;pmid=29109079"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.j4891</pub-id>
          <pub-id pub-id-type="medline">29109079</pub-id>
          <pub-id pub-id-type="pmcid">PMC5683042</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bomhof-Roordink</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Gärtner</surname>
              <given-names>FR</given-names>
            </name>
            <name name-style="western">
              <surname>Stiggelbout</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Pieterse</surname>
              <given-names>AH</given-names>
            </name>
          </person-group>
          <article-title>Key components of shared decision making models: a systematic review</article-title>
          <source>BMJ Open</source>
          <year>2019</year>
          <month>12</month>
          <day>17</day>
          <volume>9</volume>
          <issue>12</issue>
          <fpage>e031763</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmjopen.bmj.com/lookup/pmidlookup?view=long&amp;pmid=31852700"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjopen-2019-031763</pub-id>
          <pub-id pub-id-type="medline">31852700</pub-id>
          <pub-id pub-id-type="pii">bmjopen-2019-031763</pub-id>
          <pub-id pub-id-type="pmcid">PMC6937101</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hargraves</surname>
              <given-names>IG</given-names>
            </name>
            <name name-style="western">
              <surname>Fournier</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Montori</surname>
              <given-names>VM</given-names>
            </name>
            <name name-style="western">
              <surname>Bierman</surname>
              <given-names>AS</given-names>
            </name>
          </person-group>
          <article-title>Generalized shared decision making approaches and patient problems. Adapting AHRQ's SHARE approach for purposeful SDM</article-title>
          <source>Patient Educ Couns</source>
          <year>2020</year>
          <month>10</month>
          <volume>103</volume>
          <issue>10</issue>
          <fpage>2192</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32636085"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.pec.2020.06.022</pub-id>
          <pub-id pub-id-type="medline">32636085</pub-id>
          <pub-id pub-id-type="pii">S0738-3991(20)30340-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC8142549</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Contextual errors in medical decision making: overlooked and understudied</article-title>
          <source>Acad Med</source>
          <year>2016</year>
          <month>05</month>
          <volume>91</volume>
          <issue>5</issue>
          <fpage>657</fpage>
          <lpage>62</lpage>
          <pub-id pub-id-type="doi">10.1097/ACM.0000000000001017</pub-id>
          <pub-id pub-id-type="medline">26630603</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Binns-Calvey</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ashley</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dayal</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Weaver</surname>
              <given-names>FM</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Patient-centered decision making and health care outcomes: an observational study</article-title>
          <source>Ann Intern Med</source>
          <year>2013</year>
          <month>04</month>
          <day>16</day>
          <volume>158</volume>
          <issue>8</issue>
          <fpage>573</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.7326/0003-4819-158-8-201304160-00001</pub-id>
          <pub-id pub-id-type="medline">23588745</pub-id>
          <pub-id pub-id-type="pii">1676452</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weiner</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Schwartz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Weaver</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Goldberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yudkowsky</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sharma</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Binns-Calvey</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Preyss</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Schapira</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Persell</surname>
              <given-names>SD</given-names>
            </name>
            <name name-style="western">
              <surname>Jacobs</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Abrams</surname>
              <given-names>RI</given-names>
            </name>
          </person-group>
          <article-title>Contextual errors and failures in individualizing patient care: a multicenter study</article-title>
          <source>Ann Intern Med</source>
          <year>2010</year>
          <month>07</month>
          <day>20</day>
          <volume>153</volume>
          <issue>2</issue>
          <fpage>69</fpage>
          <lpage>75</lpage>
          <pub-id pub-id-type="doi">10.7326/0003-4819-153-2-201007200-00002</pub-id>
          <pub-id pub-id-type="medline">20643988</pub-id>
          <pub-id pub-id-type="pii">153/2/69</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Stephenson</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>Gruppen</surname>
              <given-names>LD</given-names>
            </name>
            <name name-style="western">
              <surname>Durning</surname>
              <given-names>SJ</given-names>
            </name>
          </person-group>
          <article-title>Management reasoning: empirical determination of key features and a conceptual model</article-title>
          <source>Acad Med</source>
          <year>2023</year>
          <month>01</month>
          <day>01</day>
          <volume>98</volume>
          <issue>1</issue>
          <fpage>80</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="doi">10.1097/ACM.0000000000004810</pub-id>
          <pub-id pub-id-type="medline">35830267</pub-id>
          <pub-id pub-id-type="pii">00001888-202301000-00027</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Reynolds</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Candler</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Virtual patient simulation at US and Canadian medical schools</article-title>
          <source>Acad Med</source>
          <year>2007</year>
          <month>05</month>
          <volume>82</volume>
          <issue>5</issue>
          <fpage>446</fpage>
          <lpage>51</lpage>
          <pub-id pub-id-type="doi">10.1097/ACM.0b013e31803e8a0a</pub-id>
          <pub-id pub-id-type="medline">17457063</pub-id>
          <pub-id pub-id-type="pii">00001888-200705000-00004</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peddle</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bearman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nestel</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Virtual patients and nontechnical skills in undergraduate health professional education: an integrative review</article-title>
          <source>Clin Simul Nurs</source>
          <year>2016</year>
          <month>09</month>
          <volume>12</volume>
          <issue>9</issue>
          <fpage>400</fpage>
          <lpage>10</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ecns.2016.04.004</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ende</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Feedback in clinical medical education</article-title>
          <source>JAMA</source>
          <year>1983</year>
          <month>08</month>
          <day>12</day>
          <volume>250</volume>
          <issue>6</issue>
          <fpage>777</fpage>
          <lpage>81</lpage>
          <pub-id pub-id-type="medline">6876333</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dudek</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Marks</surname>
              <given-names>MB</given-names>
            </name>
            <name name-style="western">
              <surname>Wood</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>AC</given-names>
            </name>
          </person-group>
          <article-title>Assessing the quality of supervisors' completed clinical evaluation reports</article-title>
          <source>Med Educ</source>
          <year>2008</year>
          <month>08</month>
          <volume>42</volume>
          <issue>8</issue>
          <fpage>816</fpage>
          <lpage>22</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1365-2923.2008.03105.x</pub-id>
          <pub-id pub-id-type="medline">18564093</pub-id>
          <pub-id pub-id-type="pii">MED3105</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Norcini</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Blank</surname>
              <given-names>LL</given-names>
            </name>
            <name name-style="western">
              <surname>Duffy</surname>
              <given-names>FD</given-names>
            </name>
            <name name-style="western">
              <surname>Fortna</surname>
              <given-names>GS</given-names>
            </name>
          </person-group>
          <article-title>The mini-CEX: a method for assessing clinical skills</article-title>
          <source>Ann Intern Med</source>
          <year>2003</year>
          <month>03</month>
          <day>18</day>
          <volume>138</volume>
          <issue>6</issue>
          <fpage>476</fpage>
          <lpage>81</lpage>
          <pub-id pub-id-type="doi">10.7326/0003-4819-138-6-200303180-00012</pub-id>
          <pub-id pub-id-type="medline">12639081</pub-id>
          <pub-id pub-id-type="pii">200303180-00012</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Holmboe</surname>
              <given-names>ES</given-names>
            </name>
            <name name-style="western">
              <surname>Yepes</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Williams</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Huot</surname>
              <given-names>SJ</given-names>
            </name>
          </person-group>
          <article-title>Feedback and the mini clinical evaluation exercise</article-title>
          <source>J Gen Intern Med</source>
          <year>2004</year>
          <month>05</month>
          <volume>19</volume>
          <issue>5 Pt 2</issue>
          <fpage>558</fpage>
          <lpage>61</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/15109324"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/j.1525-1497.2004.30134.x</pub-id>
          <pub-id pub-id-type="medline">15109324</pub-id>
          <pub-id pub-id-type="pii">JGI30134</pub-id>
          <pub-id pub-id-type="pmcid">PMC1492325</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fernando</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Cleland</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>McKenzie</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Cassar</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Identifying the factors that determine feedback given to undergraduate medical students following formative mini-CEX assessments</article-title>
          <source>Med Educ</source>
          <year>2008</year>
          <month>01</month>
          <day>22</day>
          <volume>42</volume>
          <issue>1</issue>
          <fpage>89</fpage>
          <lpage>95</lpage>
          <pub-id pub-id-type="doi">10.1111/j.1365-2923.2007.02939.x</pub-id>
          <pub-id pub-id-type="medline">18034797</pub-id>
          <pub-id pub-id-type="pii">MED2939</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jackson</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Kay</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jackson</surname>
              <given-names>WC</given-names>
            </name>
            <name name-style="western">
              <surname>Frank</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>The quality of written feedback by attendings of internal medicine residents</article-title>
          <source>J Gen Intern Med</source>
          <year>2015</year>
          <month>07</month>
          <volume>30</volume>
          <issue>7</issue>
          <fpage>973</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/25691242"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11606-015-3237-2</pub-id>
          <pub-id pub-id-type="medline">25691242</pub-id>
          <pub-id pub-id-type="pmcid">PMC4471022</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Marcotte</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Egan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Soleas</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Dalgarno</surname>
              <given-names>NJ</given-names>
            </name>
            <name name-style="western">
              <surname>Norris</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>CA</given-names>
            </name>
          </person-group>
          <article-title>Assessing the quality of feedback to general internal medicine residents in a competency-based environment</article-title>
          <source>Can Med Educ J</source>
          <year>2019</year>
          <month>11</month>
          <volume>10</volume>
          <issue>4</issue>
          <fpage>e32</fpage>
          <lpage>47</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31807225"/>
          </comment>
          <pub-id pub-id-type="medline">31807225</pub-id>
          <pub-id pub-id-type="pmcid">PMC6892309</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Sebok-Syer</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Sampson</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Monteiro</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>The quality of assessment of learning (Qual) score: validity evidence for a scoring system aimed at rating short, workplace-based comments on trainee performance</article-title>
          <source>Teach Learn Med</source>
          <year>2020</year>
          <month>02</month>
          <day>04</day>
          <volume>32</volume>
          <issue>3</issue>
          <fpage>319</fpage>
          <lpage>29</lpage>
          <pub-id pub-id-type="doi">10.1080/10401334.2019.1708365</pub-id>
          <pub-id pub-id-type="medline">32013584</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bower</surname>
              <given-names>JL</given-names>
            </name>
            <name name-style="western">
              <surname>Christensen</surname>
              <given-names>CM</given-names>
            </name>
          </person-group>
          <article-title>Disruptive technologies: catching the wave</article-title>
          <source>Harv Bus Rev</source>
          <year>1995</year>
          <month>01</month>
          <volume>73</volume>
          <issue>1</issue>
          <fpage>43</fpage>
          <lpage>53</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://hbr.org/1995/01/disruptive-technologies-catching-the-wave"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/0737-6782(96)81091-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wilkinson</surname>
              <given-names>MD</given-names>
            </name>
            <name name-style="western">
              <surname>Dumontier</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aalbersberg</surname>
              <given-names>IJ</given-names>
            </name>
            <name name-style="western">
              <surname>Appleton</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Axton</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Baak</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Blomberg</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Boiten</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>da Silva Santos</surname>
              <given-names>LB</given-names>
            </name>
            <name name-style="western">
              <surname>Bourne</surname>
              <given-names>PE</given-names>
            </name>
            <name name-style="western">
              <surname>Bouwman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Brookes</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Crosas</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dillo</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Dumon</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Edmunds</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Evelo</surname>
              <given-names>CT</given-names>
            </name>
            <name name-style="western">
              <surname>Finkers</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez-Beltran</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Groth</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Goble</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Grethe</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Heringa</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>'t Hoen</surname>
              <given-names>PA</given-names>
            </name>
            <name name-style="western">
              <surname>Hooft</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kuhn</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Kok</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Kok</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lusher</surname>
              <given-names>SJ</given-names>
            </name>
            <name name-style="western">
              <surname>Martone</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Mons</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Packer</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Persson</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Rocca-Serra</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Roos</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>van Schaik</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sansone</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schultes</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Sengstag</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Slater</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Strawn</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Swertz</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Thompson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>van der Lei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>van Mulligen</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Velterop</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Waagmeester</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wittenburg</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Wolstencroft</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mons</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>The FAIR Guiding Principles for scientific data management and stewardship</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <month>03</month>
          <day>15</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>160018</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/sdata.2016.18"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.18</pub-id>
          <pub-id pub-id-type="medline">26978244</pub-id>
          <pub-id pub-id-type="pii">sdata201618</pub-id>
          <pub-id pub-id-type="pmcid">PMC4792175</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dias</surname>
              <given-names>RD</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Yule</surname>
              <given-names>SJ</given-names>
            </name>
          </person-group>
          <article-title>Using machine learning to assess physician competence: a systematic review</article-title>
          <source>Acad Med</source>
          <year>2019</year>
          <month>03</month>
          <volume>94</volume>
          <issue>3</issue>
          <fpage>427</fpage>
          <lpage>39</lpage>
          <pub-id pub-id-type="doi">10.1097/ACM.0000000000002414</pub-id>
          <pub-id pub-id-type="medline">30113364</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Spickard 3rd</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ridinger</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wrenn</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>O'Brien</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Shpigel</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Wolf</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Stein</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Denny</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Automatic scoring of medical students' clinical notes to monitor learning in the workplace</article-title>
          <source>Med Teach</source>
          <year>2014</year>
          <month>01</month>
          <day>07</day>
          <volume>36</volume>
          <issue>1</issue>
          <fpage>68</fpage>
          <lpage>72</lpage>
          <pub-id pub-id-type="doi">10.3109/0142159X.2013.849801</pub-id>
          <pub-id pub-id-type="medline">24195470</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cianciolo</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>LaVoie</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parker</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Machine scoring of medical students' written clinical reasoning: initial validity evidence</article-title>
          <source>Acad Med</source>
          <year>2021</year>
          <month>07</month>
          <day>01</day>
          <volume>96</volume>
          <issue>7</issue>
          <fpage>1026</fpage>
          <lpage>35</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33637657"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/ACM.0000000000004010</pub-id>
          <pub-id pub-id-type="medline">33637657</pub-id>
          <pub-id pub-id-type="pii">00001888-202107000-00050</pub-id>
          <pub-id pub-id-type="pmcid">PMC8243833</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Turner</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hashimoto</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Vasisht</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schaye</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Demystifying AI: current state and future role in medical education assessment</article-title>
          <source>Acad Med</source>
          <year>2024</year>
          <month>04</month>
          <day>01</day>
          <volume>99</volume>
          <issue>4S Suppl 1</issue>
          <fpage>S42</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="doi">10.1097/ACM.0000000000005598</pub-id>
          <pub-id pub-id-type="medline">38166201</pub-id>
          <pub-id pub-id-type="pii">00001888-202404001-00008</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bond</surname>
              <given-names>WF</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bhat</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>YS</given-names>
            </name>
            <name name-style="western">
              <surname>Ebert-Allen</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Ruger</surname>
              <given-names>RL</given-names>
            </name>
            <name name-style="western">
              <surname>Yudkowsky</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Automated patient note grading: examining scoring reliability and feasibility</article-title>
          <source>Acad Med</source>
          <year>2023</year>
          <month>11</month>
          <day>01</day>
          <volume>98</volume>
          <issue>11S</issue>
          <fpage>S90</fpage>
          <lpage>7</lpage>
          <pub-id pub-id-type="doi">10.1097/ACM.0000000000005357</pub-id>
          <pub-id pub-id-type="medline">37983401</pub-id>
          <pub-id pub-id-type="pii">00001888-202311001-00015</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>DA</given-names>
            </name>
          </person-group>
          <article-title>Creating virtual patients using large language models: scalable, global, and low cost</article-title>
          <source>Med Teach</source>
          <year>2025</year>
          <month>01</month>
          <day>11</day>
          <volume>47</volume>
          <issue>1</issue>
          <fpage>40</fpage>
          <lpage>2</lpage>
          <pub-id pub-id-type="doi">10.1080/0142159X.2024.2376879</pub-id>
          <pub-id pub-id-type="medline">38992981</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <collab>American Educational Research Association</collab>
            <collab>American Psychological Association</collab>
            <collab>National Council on Measurement in Education</collab>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Tong</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>De Los Reyes</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Buckendahl</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Forte</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kuncel</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Laitusis</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Marquine</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Validity</article-title>
          <source>The standards for educational and psychological testing</source>
          <year>2014</year>
          <publisher-loc>Washington, DC</publisher-loc>
          <publisher-name>American Educational Research Association</publisher-name>
          <fpage>11</fpage>
          <lpage>31</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cook</surname>
              <given-names>DA</given-names>
            </name>
            <name name-style="western">
              <surname>Beckman</surname>
              <given-names>TJ</given-names>
            </name>
          </person-group>
          <article-title>Current concepts in validity and reliability for psychometric instruments: theory and application</article-title>
          <source>Am J Med</source>
          <year>2006</year>
          <month>03</month>
          <volume>119</volume>
          <issue>2</issue>
          <fpage>166.e7</fpage>
          <lpage>16</lpage>
          <pub-id pub-id-type="doi">10.1016/j.amjmed.2005.10.036</pub-id>
          <pub-id pub-id-type="medline">16443422</pub-id>
          <pub-id pub-id-type="pii">S0002-9343(05)01037-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Howcroft</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Belz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Clinciu</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Gkatzia</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Hasan</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Mahamood</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mille</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>van Miltenburg</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Santhanam</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Rieser</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Twenty years of confusion in human evaluation: NLG needs evaluation sheets and standardised definitions</article-title>
          <source>Proceedings of the 13th International Conference on Natural Language Generation</source>
          <year>2020</year>
          <conf-name>INLG '20</conf-name>
          <conf-date>December 15-18, 2020</conf-date>
          <conf-loc>Dublin, Ireland</conf-loc>
          <fpage>169</fpage>
          <lpage>82</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2020.inlg-1.23.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.inlg-1.23</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Roller</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Boureau</surname>
              <given-names>YL</given-names>
            </name>
            <name name-style="western">
              <surname>Weston</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bordes</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dinan</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Fan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gunning</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ju</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Poff</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ringshia</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Shuster</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>EM</given-names>
            </name>
          </person-group>
          <article-title>Open-domain conversational agents: current progress, open problems, and future directions</article-title>
          <source>arXiv</source>
          <fpage>2006.12442</fpage>
          <comment>Preprint posted online June 22, 2020</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2006.12442"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Adiwardana</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Luong</surname>
              <given-names>MT</given-names>
            </name>
            <name name-style="western">
              <surname>So</surname>
              <given-names>DR</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Fiedel</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Thoppilan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Yang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Kulshreshtha</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Nemade</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>QV</given-names>
            </name>
          </person-group>
          <article-title>Towards a human-like open-domain chatbot</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online January 27, 2020</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2001.09977"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>August</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Serrano</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Haduong</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Gururangan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>NA</given-names>
            </name>
          </person-group>
          <article-title>All that’s ‘human’ is not gold: evaluating human evaluation of generated text</article-title>
          <source>Proceedings of the 59th Annual Meeting of the Association for Computational Linguistics and the 11th International Joint Conference on Natural Language Processing</source>
          <year>2021</year>
          <conf-name>ACL-IJCNLP '21</conf-name>
          <conf-date>August 1-6, 2021</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <fpage>7282</fpage>
          <lpage>96</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2021.acl-long.565.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2021.acl-long.565</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Deriu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rodrigo</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Otegi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Echegoyen</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Rosset</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Agirre</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Cieliebak</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Survey on evaluation methods for dialogue systems</article-title>
          <source>Artif Intell Rev</source>
          <year>2021</year>
          <month>06</month>
          <day>25</day>
          <volume>54</volume>
          <issue>1</issue>
          <fpage>755</fpage>
          <lpage>810</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33505103"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s10462-020-09866-x</pub-id>
          <pub-id pub-id-type="medline">33505103</pub-id>
          <pub-id pub-id-type="pii">9866</pub-id>
          <pub-id pub-id-type="pmcid">PMC7817575</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Finch</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>JD</given-names>
            </name>
          </person-group>
          <article-title>Towards unified dialogue system evaluation: a comprehensive analysis of current evaluation protocols</article-title>
          <source>Proceedings of the 21th Annual Meeting of the Special Interest Group on Discourse and Dialogue</source>
          <year>2020</year>
          <conf-name>SIGDIAL '20</conf-name>
          <conf-date>July 1-3, 2020</conf-date>
          <conf-loc>Virtual event</conf-loc>
          <fpage>236</fpage>
          <lpage>45</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2020.sigdial-1.29.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.sigdial-1.29</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zellers</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Holtzman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Qin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Farhadi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Choi</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>TuringAdvice: a generative and dynamic evaluation of language use</article-title>
          <source>Proceedings of the 2021 Conference of the North American Chapter of the Association for Computational Linguistics: Human Language Technologies</source>
          <year>2021</year>
          <conf-name>NAACL '21</conf-name>
          <conf-date>June 6-11, 2021</conf-date>
          <conf-loc>Virtual event</conf-loc>
          <fpage>4856</fpage>
          <lpage>80</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2021.naacl-main.386.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2021.naacl-main.386</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>CW</given-names>
            </name>
            <name name-style="western">
              <surname>Lowe</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Serban</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Noseworthy</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Charlin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Pineau</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>How NOT to evaluate your dialogue system: an empirical study of unsupervised evaluation metrics for dialogue response generation</article-title>
          <source>Proceedings of the 2016 Conference on Empirical Methods in Natural Language Processing</source>
          <year>2016</year>
          <conf-name>EMNLP '16</conf-name>
          <conf-date>November 1-5, 2016</conf-date>
          <conf-loc>Austin, TX</conf-loc>
          <fpage>2122</fpage>
          <lpage>32</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/D16-1230.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/d16-1230</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Hsu</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Roller</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Boureau</surname>
              <given-names>YL</given-names>
            </name>
            <name name-style="western">
              <surname>Weston</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Human evaluation of conversations is an open problem: comparing the sensitivity of various methods for evaluating dialogue agents</article-title>
          <source>Proceedings of the 4th Workshop on NLP for Conversational AI</source>
          <year>2022</year>
          <conf-name>NLP4ConvAI '22</conf-name>
          <conf-date>May 27, 2022</conf-date>
          <conf-loc>Dublin, Ireland</conf-loc>
          <fpage>77</fpage>
          <lpage>97</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2022.nlp4convai-1.8.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2022.nlp4convai-1.8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yeh</surname>
              <given-names>YT</given-names>
            </name>
            <name name-style="western">
              <surname>Eskenazi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mehri</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>A comprehensive assessment of dialog evaluation metrics</article-title>
          <source>Proceedings of the 1st Workshop on Evaluations and Assessments of Neural Conversation Systems</source>
          <year>2021</year>
          <conf-name>EANCS '21</conf-name>
          <conf-date>November 11, 2021</conf-date>
          <conf-loc>Virtual event</conf-loc>
          <fpage>15</fpage>
          <lpage>33</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2021.eancs-1.3.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2021.eancs-1.3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van der Lee</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gatt</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>van Miltenburg</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Wubben</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Krahmer</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Best practices for the human evaluation of automatically generated text</article-title>
          <source>Proceedings of the 12th International Conference on Natural Language Generation</source>
          <year>2019</year>
          <conf-name>INLG '19</conf-name>
          <conf-date>October 28-November 1, 2019</conf-date>
          <conf-loc>Tokyo, Japan</conf-loc>
          <fpage>355</fpage>
          <lpage>68</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/W19-8643.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/w19-8643</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huwendiek</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>De Leng</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Kononowicz</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Kunzmann</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Muijtjens</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Van Der Vleuten</surname>
              <given-names>CP</given-names>
            </name>
            <name name-style="western">
              <surname>Hoffmann</surname>
              <given-names>GF</given-names>
            </name>
            <name name-style="western">
              <surname>Tönshoff</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dolmans</surname>
              <given-names>DH</given-names>
            </name>
          </person-group>
          <article-title>Exploring the validity and reliability of a questionnaire for evaluating virtual patient design with a special emphasis on fostering clinical reasoning</article-title>
          <source>Med Teach</source>
          <year>2015</year>
          <month>08</month>
          <day>14</day>
          <volume>37</volume>
          <issue>8</issue>
          <fpage>775</fpage>
          <lpage>82</lpage>
          <pub-id pub-id-type="doi">10.3109/0142159X.2014.970622</pub-id>
          <pub-id pub-id-type="medline">25313931</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Peddle</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bearman</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>McKenna</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Nestel</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Exploring undergraduate nursing student interactions with virtual patients to develop 'non-technical skills' through case study methodology</article-title>
          <source>Adv Simul (Lond)</source>
          <year>2019</year>
          <month>02</month>
          <day>13</day>
          <volume>4</volume>
          <issue>1</issue>
          <fpage>2</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://advancesinsimulation.biomedcentral.com/articles/10.1186/s41077-019-0088-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s41077-019-0088-7</pub-id>
          <pub-id pub-id-type="medline">30805205</pub-id>
          <pub-id pub-id-type="pii">88</pub-id>
          <pub-id pub-id-type="pmcid">PMC6373120</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Schubert</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Friedmann</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Regenbrecht</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>The experience of presence: factor analytic insights</article-title>
          <source>Presence Teleoperators Virtual Environ</source>
          <year>2001</year>
          <month>06</month>
          <volume>10</volume>
          <issue>3</issue>
          <fpage>266</fpage>
          <lpage>81</lpage>
          <pub-id pub-id-type="doi">10.1162/105474601300343603</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Fink</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Reitmeier</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Stadler</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Siebeck</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fischer</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Fischer</surname>
              <given-names>MR</given-names>
            </name>
          </person-group>
          <article-title>Assessment of diagnostic competences with standardized patients versus virtual patients: experimental study in the context of history taking</article-title>
          <source>J Med Internet Res</source>
          <year>2021</year>
          <month>03</month>
          <day>04</day>
          <volume>23</volume>
          <issue>3</issue>
          <fpage>e21196</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2021/3/e21196/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/21196</pub-id>
          <pub-id pub-id-type="medline">33661122</pub-id>
          <pub-id pub-id-type="pii">v23i3e21196</pub-id>
          <pub-id pub-id-type="pmcid">PMC7974754</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Clement</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Oswald</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ghosh</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Exploring the quality of feedback in entrustable professional activity narratives across 24 residency training programs</article-title>
          <source>J Grad Med Educ</source>
          <year>2024</year>
          <volume>16</volume>
          <fpage>23</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.4300/jgme-d-23-00210.1</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>McGuire</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Acai</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sonnadara</surname>
              <given-names>RR</given-names>
            </name>
          </person-group>
          <article-title>The McMaster narrative comment rating tool: development and initial validity evidence</article-title>
          <source>Teach Learn Med</source>
          <year>2025</year>
          <month>01</month>
          <volume>37</volume>
          <issue>1</issue>
          <fpage>86</fpage>
          <lpage>98</lpage>
          <pub-id pub-id-type="doi">10.1080/10401334.2023.2276799</pub-id>
          <pub-id pub-id-type="medline">37964518</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zelenski</surname>
              <given-names>AB</given-names>
            </name>
            <name name-style="western">
              <surname>Tischendorf</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>Kessler</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Saunders</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>MacDonald</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Vogelman</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Zakowski</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Beyond "read more": an intervention to improve faculty written feedback to learners</article-title>
          <source>J Grad Med Educ</source>
          <year>2019</year>
          <month>08</month>
          <volume>11</volume>
          <issue>4</issue>
          <fpage>468</fpage>
          <lpage>71</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31440343"/>
          </comment>
          <pub-id pub-id-type="doi">10.4300/JGME-D-19-00058.1</pub-id>
          <pub-id pub-id-type="medline">31440343</pub-id>
          <pub-id pub-id-type="pii">JGME-D-19-00058</pub-id>
          <pub-id pub-id-type="pmcid">PMC6699542</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Van Ostaeyen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Embo</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rotsaert</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>De Clercq</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Schellens</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Valcke</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>A qualitative textual analysis of feedback comments in eportfolios: quality and alignment with the CanMEDS roles</article-title>
          <source>Perspect Med Educ</source>
          <year>2023</year>
          <month>12</month>
          <day>22</day>
          <volume>12</volume>
          <issue>1</issue>
          <fpage>584</fpage>
          <lpage>93</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/38144672"/>
          </comment>
          <pub-id pub-id-type="doi">10.5334/pme.1050</pub-id>
          <pub-id pub-id-type="medline">38144672</pub-id>
          <pub-id pub-id-type="pmcid">PMC10742175</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gin</surname>
              <given-names>BC</given-names>
            </name>
            <name name-style="western">
              <surname>Ten Cate</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>O'Sullivan</surname>
              <given-names>PS</given-names>
            </name>
            <name name-style="western">
              <surname>Hauer</surname>
              <given-names>KE</given-names>
            </name>
            <name name-style="western">
              <surname>Boscardin</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Exploring how feedback reflects entrustment decisions using artificial intelligence</article-title>
          <source>Med Educ</source>
          <year>2022</year>
          <month>03</month>
          <volume>56</volume>
          <issue>3</issue>
          <fpage>303</fpage>
          <lpage>11</lpage>
          <pub-id pub-id-type="doi">10.1111/medu.14696</pub-id>
          <pub-id pub-id-type="medline">34773415</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Spadafore</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yilmaz</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Rally</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chan</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Russell</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Thoma</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Monteiro</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pardhan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Martin</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Monrad</surname>
              <given-names>SU</given-names>
            </name>
            <name name-style="western">
              <surname>Woods</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Using natural language processing to evaluate the quality of supervisor narrative comments in competency-based medical education</article-title>
          <source>Acad Med</source>
          <year>2024</year>
          <month>05</month>
          <day>01</day>
          <volume>99</volume>
          <issue>5</issue>
          <fpage>534</fpage>
          <lpage>40</lpage>
          <pub-id pub-id-type="doi">10.1097/ACM.0000000000005634</pub-id>
          <pub-id pub-id-type="medline">38232079</pub-id>
          <pub-id pub-id-type="pii">00001888-202405000-00019</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dev</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sheng</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Amstutz</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hou</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Sanseverino</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Nishi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>KW</given-names>
            </name>
          </person-group>
          <article-title>On measures of biases and harms in NLP</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online August 7, 2021</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2108.03362"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2022.findings-aacl.24</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hovy</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Prabhumoye</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Five sources of bias in natural language processing</article-title>
          <source>Lang Linguist Compass</source>
          <year>2021</year>
          <month>08</month>
          <day>20</day>
          <volume>15</volume>
          <issue>8</issue>
          <fpage>e12432</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/35864931"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/lnc3.12432</pub-id>
          <pub-id pub-id-type="medline">35864931</pub-id>
          <pub-id pub-id-type="pii">LNC312432</pub-id>
          <pub-id pub-id-type="pmcid">PMC9285808</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref64">
        <label>64</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Navigli</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Conia</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ross</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Biases in large language models: origins, inventory, and discussion</article-title>
          <source>ACM J Data Inf Qual</source>
          <year>2023</year>
          <month>06</month>
          <day>22</day>
          <volume>15</volume>
          <issue>2:article 10</issue>
          <fpage>1</fpage>
          <lpage>21</lpage>
          <pub-id pub-id-type="doi">10.1145/3597307</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref65">
        <label>65</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Blodgett</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Barocas</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Daumé III</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Wallach</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Language (technology) is power: a critical survey of “bias” in NLP</article-title>
          <source>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</source>
          <year>2020</year>
          <conf-name>ACL '20</conf-name>
          <conf-date>July 5-10, 2020</conf-date>
          <conf-loc>Virtual event</conf-loc>
          <fpage>5454</fpage>
          <lpage>76</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2020.acl-main.485.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.acl-main.485</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref66">
        <label>66</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Landis</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Koch</surname>
              <given-names>GG</given-names>
            </name>
          </person-group>
          <article-title>The measurement of observer agreement for categorical data</article-title>
          <source>Biometrics</source>
          <year>1977</year>
          <month>03</month>
          <volume>33</volume>
          <issue>1</issue>
          <fpage>159</fpage>
          <lpage>74</lpage>
          <pub-id pub-id-type="doi">10.2307/2529310</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref67">
        <label>67</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>YK</given-names>
            </name>
            <name name-style="western">
              <surname>Low</surname>
              <given-names>WY</given-names>
            </name>
            <name name-style="western">
              <surname>Ng</surname>
              <given-names>CJ</given-names>
            </name>
          </person-group>
          <article-title>Exploring patient values in medical decision making: a qualitative study</article-title>
          <source>PLoS One</source>
          <year>2013</year>
          <month>11</month>
          <day>25</day>
          <volume>8</volume>
          <issue>11</issue>
          <fpage>e80051</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0080051"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0080051</pub-id>
          <pub-id pub-id-type="medline">24282518</pub-id>
          <pub-id pub-id-type="pii">PONE-D-13-18276</pub-id>
          <pub-id pub-id-type="pmcid">PMC3839918</pub-id>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
