<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e84747</article-id><article-id pub-id-type="doi">10.2196/84747</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>A Large Language Model&#x2013;Powered Multiagent Framework Emulating Standardized Patients in Clinical Communication Skills Training: Development and Evaluation Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Qu</surname><given-names>Yufei</given-names></name><degrees>BEng</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Xu</surname><given-names>Xiaowei</given-names></name><degrees>MS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Long</surname><given-names>Yunzi</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Yijie</given-names></name><degrees>BEng</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Li</surname><given-names>Jiao</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Lu</surname><given-names>Xudong</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>College of Biomedical Engineering and Instrument Science, Zhejiang University</institution><addr-line>No. 38 Zheda Road</addr-line><addr-line>Hangzhou</addr-line><country>China</country></aff><aff id="aff2"><institution>Institute of Medical Information/Library, Chinese Academy of Medical Sciences and Peking Union Medical College</institution><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff3"><institution>Department of Cariology and Endodontology, Peking University School and Hospital of Stomatology and National Center for Stomatology and National Clinical Research Center for Oral Diseases and National Engineering Research Center of Oral Biomaterials and Digital Medical Devices and Beijing Key Laboratory</institution><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff4"><institution>Department of General Dentistry II, Peking University School and Hospital of Stomatology</institution><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff5"><institution>Hangzhou Joyrun Medical Science and Technology co., 
LTD</institution><addr-line>Hangzhou</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Grilo</surname><given-names>Ana</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Huang</surname><given-names>Ping</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Sun</surname><given-names>Zhoujian</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Xudong Lu, PhD, College of Biomedical Engineering and Instrument Science, Zhejiang University, No. 38 Zheda Road, Hangzhou, 310058, China; <email>lvxd@zju.edu.cn</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>4</day><month>6</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e84747</elocation-id><history><date date-type="received"><day>28</day><month>09</month><year>2025</year></date><date date-type="rev-recd"><day>19</day><month>04</month><year>2026</year></date><date date-type="accepted"><day>20</day><month>04</month><year>2026</year></date></history><copyright-statement>&#x00A9; Yufei Qu, Xiaowei Xu, Yunzi Long, Yijie Wang, Jiao Li, Xudong Lu. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 4.6.2026. 
</copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e84747"/><abstract><sec><title>Background</title><p>Effective clinical communication is essential for medical practice, with standardized patients (SPs) being a reliable standard training method despite resource limitations. While large language models (LLMs) show strong role-playing abilities, current virtual patients (VPs) based on single LLMs face fidelity and interaction challenges. 
Recent advances in multiagent frameworks, which have demonstrated considerable potential in handling complex tasks, offer a new perspective for creating VPs in medical education.</p></sec><sec><title>Objective</title><p>This study aimed to develop and evaluate a novel multiagent VP framework that simulates SPs through a collaborative agent design, thereby enhancing human-like fidelity and interaction performance in clinical communication training&#x2013;oriented VP simulation.</p></sec><sec sec-type="methods"><title>Methods</title><p>Our multiagent framework constructed 5 specialized subagents by simulating the functional partitioning of brain regions, collaboratively simulating the entire process, from case reception to interactive consultation scenarios, designed for medical students. To enhance the interaction performance of VPs, we incorporated retrieval-augmented technology, while deep character reasoning was used to improve response richness and realism. We evaluated the proposed framework through a 2-phase experiment in which the metrics of response quality, role-playing performance, interaction efficiency, information accumulation, and perceived educational utility were applied consistently: first, to compare different base models, and second, to benchmark the complete framework against a single-LLM baseline.</p></sec><sec sec-type="results"><title>Results</title><p>The multiagent framework outperformed single-LLM baselines across multiple evaluation settings, achieving high information accuracy and role-playing scores under standardized dialogue conditions. Specifically, the GPT-4o&#x2013;based implementation achieved peak factual consistency of 0.769 (SD 0.04), while all configurations maintained &#x003E;94% clinical accuracy. The Qwen3-32B&#x2013;based framework achieved the lowest misleading rate of 1.28% (SD 1.20), compared to 4.72% (SD 1.53%) for single-LLM scoring. 
In assessments using standard dialogue scripts, the Qwen3-32B&#x2013;based framework attained the highest role-playing competency score of 39.67 (SD 0.71) and received high expert praise. However, limited discriminative power against specific leading questions on low-quality inquiries indicated that while these findings specifically establish high fidelity under structured conditions, further adaptation is required for authentic student interactions. Interaction efficiency remained practical with acceptable latency (~3 s) based on Qwen3-32B while maintaining a stable information pace during multiturn dialogues. Furthermore, a preliminary exploration of factual consistency and role-playing ability across 5 clinical departments demonstrated potential scalability.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The multiagent framework offers a viable simulation of SPs through the coordinated interaction of multiple LLM-based agents. This approach enhances the performance of VP simulation, providing a customizable and scalable solution for medical communication training, without compromising patient confidentiality. The framework holds substantial potential for advancing medical education approaches.</p></sec></abstract><kwd-group><kwd>virtual patient</kwd><kwd>large language models</kwd><kwd>multiagent</kwd><kwd>medical education</kwd><kwd>communication skills</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>In the medical field, proficient clinical communication skills, encompassing medical history taking, physical examination, diagnosis, and decision-making processes, constitute fundamental competencies for clinical practice [<xref ref-type="bibr" rid="ref1">1</xref>]. 
Strong communication skills enable physicians to obtain accurate information during diagnosis and treatment, thereby increasing diagnostic quality [<xref ref-type="bibr" rid="ref2">2</xref>] and establishing effective physician-patient relationships [<xref ref-type="bibr" rid="ref3">3</xref>]. These demands place increasing emphasis on the development of medical students&#x2019; clinical abilities.</p><p>To enhance clinical communication skills, medical education uses a variety of approaches, such as didactic lectures [<xref ref-type="bibr" rid="ref4">4</xref>], feedback [<xref ref-type="bibr" rid="ref5">5</xref>], standard curriculum [<xref ref-type="bibr" rid="ref6">6</xref>], role-play [<xref ref-type="bibr" rid="ref7">7</xref>], and standardized patients (SPs) [<xref ref-type="bibr" rid="ref8">8</xref>], as well as digital strategies [<xref ref-type="bibr" rid="ref9">9</xref>], including online modules, virtual patient (VP) simulations, and blended digital education. Among these, SPs, trained actors simulating real patients, are recognized as one of the most effective and widely adopted methods [<xref ref-type="bibr" rid="ref10">10</xref>]. However, SP-based training faces significant challenges, including high resource requirements for recruitment, training, and scenario design [<xref ref-type="bibr" rid="ref11">11</xref>], as well as SPs&#x2019; anxiety and fatigue. 
Owing to these limitations, traditional teaching methods are still predominantly used in instruction [<xref ref-type="bibr" rid="ref12">12</xref>], which remain insufficient for strengthening clinical competencies [<xref ref-type="bibr" rid="ref13">13</xref>].</p></sec><sec id="s1-2"><title>Prior Work</title><p>Recent advances in large language models (LLMs) have unlocked unprecedented capabilities in contextual interaction [<xref ref-type="bibr" rid="ref14">14</xref>], dynamic role-playing [<xref ref-type="bibr" rid="ref15">15</xref>], and clinical reasoning [<xref ref-type="bibr" rid="ref16">16</xref>], establishing a technological foundation for developing VPs as scalable alternatives to SPs [<xref ref-type="bibr" rid="ref17">17</xref>]. The construction of patient simulators has been extensively explored for both training and evaluating clinical LLMs. In the training domain, systems such as AMIE [<xref ref-type="bibr" rid="ref18">18</xref>] leverage patient agents derived from structured literature profiles to optimize performance via self-play and chain-of-reasoning, which prioritize authenticity, relevance, and fidelity to generate robust fine-tuning consultation dialogue datasets. In the evaluation domain, patient simulators serve as benchmarking frameworks to assess diagnostic accuracy in simulated clinical environments [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>]. Notably, the Baichuan framework [<xref ref-type="bibr" rid="ref21">21</xref>] enhances the patient simulator reliability by integrating rule-based judgment, dynamic prompting, and real-time correction modules to ensure output validity. Furthermore, research on VPs tailored for student education primarily prioritizes the responsiveness to medical inquiries and the interactive conversational experience. 
Initial explorations [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref24">24</xref>] using prompt engineering and visual augmentation have demonstrated the feasibility of LLM-based VPs. However, single-LLM implementations still suffer from limited fidelity in persona simulation [<xref ref-type="bibr" rid="ref25">25</xref>] and persistent hallucination issues [<xref ref-type="bibr" rid="ref26">26</xref>]. Beyond conventional optimization strategies such as fine-tuning [<xref ref-type="bibr" rid="ref15">15</xref>] and reinforcement learning [<xref ref-type="bibr" rid="ref27">27</xref>], recent studies have increasingly explored multiagent architectures and proposed various organizational frameworks for multiagent systems (MAS), including flat, hierarchical, holonic, coalition-based, team-oriented, matrix, and congregation structures [<xref ref-type="bibr" rid="ref28">28</xref>], which enhance LLM performance through agent collaboration and complex task decomposition [<xref ref-type="bibr" rid="ref29">29</xref>].</p><p>Although research on multiagent VPs remains limited, notable efforts include SimPatient, which uses 3 specialized subagents (patient response generation, behavior coding, and cognitive modeling) to strengthen clinical authenticity in patient interactions [<xref ref-type="bibr" rid="ref30">30</xref>], and Yu et al [<xref ref-type="bibr" rid="ref31">31</xref>], who introduced a retrieval-augmented methodology based on a knowledge graph, enabling subagents to collaboratively perform retrieval tasks and improve response accuracy in clinical reporting. Despite these advances, current solutions remain inadequate for large-scale educational adoption due to insufficient system flexibility [<xref ref-type="bibr" rid="ref11">11</xref>] and lack of robust evaluation metrics, particularly those assessing role-playing fidelity [<xref ref-type="bibr" rid="ref32">32</xref>]. 
This underscores the need for further research on VP systems tailored for clinical communication training, with a focus on accurate clinical information delivery, anthropomorphic authenticity, and scalability.</p></sec><sec id="s1-3"><title>Goal of This Study</title><p>The aim of this study was to develop and evaluate a multiagent VP framework for clinical communication training. The proposed framework is designed to improve simulation fidelity and validity, interaction performance, and scalability of VP simulations and to systematically examine whether a multiagent architecture provides advantages over single-LLM approaches in SP emulation.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overall Study Design</title><p>This study aimed to develop and evaluate a multiagent framework for VPs (<xref ref-type="fig" rid="figure1">Figure 1</xref>). The research pipeline comprised the following steps. First, clinical case reports were collected from real-world medical records and publicly available datasets to build the VP system. Subsequently, a multiagent framework with 5 subagents was designed to simulate SPs in clinical communication training scenarios, using the collected case reports. Finally, a comprehensive evaluation was conducted to select a relatively well-rounded base model for the framework and to assess the performance of the MAS in comparison with a single-LLM approach.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overview of the study design, including the report repository, multiagent framework, and evaluation process.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig01.png"/></fig></sec><sec id="s2-2"><title>Ethical Considerations</title><p>Ethical approval for the study was granted by the Biomedical Institutional Review Board of the Peking University School and Hospital of Stomatology (PKUSSIRB-2025115200). 
Participants were informed about their involvement in this research, and all procedures strictly adhered to ethical standards outlined in the Declaration of Helsinki.</p></sec><sec id="s2-3"><title>Clinical Case Repository</title><p>Our clinical case repository included real-world cases and public medical datasets: (1) the CMB-Chinese Medical Benchmark dataset [<xref ref-type="bibr" rid="ref33">33</xref>], which consists of 74 complex medical consultation cases from multiple departments, containing chief complaints, present illness, medical history, and examination results, all with personal identifiers removed; and (2) 40 real-world dental cases collected from Peking University School and Hospital of Stomatology, modified to include standardized information such as chief complaints, present illness, medical history, examination findings, and diagnoses, in line with SP teaching requirements.</p></sec><sec id="s2-4"><title>Multiagent System</title><p>On the basis of the involvement of distinct functional brain regions in the educational process of SPs (including script preparation, script learning, feedback practice, and interaction), such as the prefrontal cortex, superior frontal cortex, temporal cortex, and brainstem, we mapped these functions onto specialized subagents responsible for managing different aspects of the VP interaction. Building on this foundation, we designed subagents aimed at enhancing the role-playing performance and scalability of existing methods through the refined functional division and collaborative operation of agents. 
Our framework integrates 5 specialized agents to support SP simulation using a bio-inspired architecture where each agent is functionally mapped to specific brain regions: the character mapping agent embodies role traits and sustains motivational states, reflecting emotion processing in the prefrontal cortex and state maintenance in the limbic system; the memory management agent, analogous to the hippocampal-prefrontal network, enables contextual encoding, consolidation, and retrieval; the information processing agent supports cognitive flexibility and query interpretation, mimicking the adaptive functions of the prefrontal cortex and temporal lobe; the language generation agent corresponds to cerebral language comprehension and production areas to formulate coherent responses; and the rethinking agent is modeled after the integrative function within the prefrontal network, which supports performance monitoring, error detection, and behavioral adaptation.</p><p>Building on these preliminary results, our study proposes a cerebral cortex&#x2013;inspired MAS framework designed to simulate SP clinical training behaviors, as shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Our multiagent framework uses a 3-stage framework comprising patient construction, diagnostic reasoning, and response generation. Through simulated multiround interactions mimicking brain regions, the system (1) identifies patient information relevant to queries in case reports, (2) generates SP responses, and (3) checks the responses. 
The framework incorporates persistent memory mechanisms to ensure dialogue coherence across conversational turns.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig02.png"/></fig><p>The framework operates with 3 external inputs: one of the clinical case reports in the repository, natural language queries from users, and instructional requirements from medical teachers. All subagents rely on an LLM core for tasks such as retrieval, generation, and reflection. The multiagent framework has been specifically optimized for medical education. The agent-based collaborative workflow consists of (1) a character mapping agent simulating patients&#x2019; emotional responses and character traits, (2) a memory management agent storing and retrieving case information during consultation, (3) an information processing agent evaluating students&#x2019; diagnostic questions and generating appropriate dynamic prompts, (4) a language generation agent producing clinically accurate responses, and (5) a rethinking agent assessing whether responses satisfy SP standards.</p><p><italic>The character mapping agent</italic> generates personalized VPs by drawing inspiration from neurocognitive processes, including prefrontal cortex&#x2013;mediated emotional regulation, limbic system-driven motivational states, and affective responses. To protect data privacy, all patient case reports are anonymized before being processed by the LLM to synthesize multidimensional characteristic profiles. The agent uses a hierarchical prompt architecture in which the shallow instruction layer constructs personality profiles according to the Five-Factor Model [<xref ref-type="bibr" rid="ref34">34</xref>], while concurrently processing medical records and teacher inputs to generate associated demographic markers. 
Specifically, teacher input refers to the patient attributes specified by the teacher, including age, personality, and communication barriers, for the generation of a customized VP. Subsequently, the deep instruction layer operationalizes these personality dimensions into explicit behavioral protocols, systematically defining latent objectives, response thresholds, and psychological defense mechanisms as specified in <xref ref-type="fig" rid="figure3">Figure 3</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Schematic of the hierarchical prompt design for the character mapping agent.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig03.png"/></fig><p><italic>The memory management agent</italic> uses a dual-module architecture, as shown in <xref ref-type="fig" rid="figure4">Figure 4</xref>. Unlike traditional retrieval-augmented generation methods, the dual-module architecture of this agent is designed to achieve retrieval enhancement by leveraging the semantic analysis capability of the core LLM. The automatic processing module structures clinical data into standardized medical categories, including chief complaint, present illness, past history, family history, and examination results, and then segments each category into minimal semantic units. For retrieval, the agent uses prompt engineering to guide the core LLM in performing semantic analysis and selecting the most relevant partitions. When it receives a user query, the agent then processes multithreaded parallel tasks to extract the most relevant information from each module. Finally, the agent aggregates these results to return the most relevant case report information.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>The workflow for automatic processing and retrieval of clinical case reports of the memory management agent. 
LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig04.png"/></fig><p><italic>The information processing agent</italic> functionally maps to the prefrontal-temporal cortex, handling input query analysis and semantic processing to generate responses during clinical interview simulations. Initially, the agent uses an LLM to assess the quality of a medical consultation, evaluating communication competencies, such as therapeutic rapport, verbal clarity, and proper use of medical terminology. The system then automatically classifies questioning patterns into clinically relevant categories, such as open-ended facilitation and closed-ended interrogation. Finally, the framework uses these analytical labels to dynamically select appropriate response strategies to guide the language generation agent, as detailed in <xref ref-type="fig" rid="figure5">Figure 5</xref>.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Content of dynamic prompts, including consistency prompts and variable prompts guided by the information processing agent.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig05.png"/></fig><p><italic>The language generation agent</italic> generates replies by integrating inputs from previous modules, including the patient&#x2019;s personality profile, medical records, and the current query context, all while keeping track of the conversation&#x2019;s memory state.</p><p><italic>The rethinking agent</italic> corresponds to the monitoring functions of the prefrontal network, implementing a quality control mechanism to evaluate generated responses against SP requirements. 
Its key validation criteria include (1) avoidance of physician query repetition, (2) maintenance of patient-appropriate knowledge levels, (3) clinical relevance optimization, and (4) pedagogical alignment through information dosage control. Validated outputs are incorporated into the evolving dialogue history.</p></sec><sec id="s2-5"><title>Inquiry Test Set</title><p>To rigorously evaluate the VP, we developed both a standard inquiry test set and a low-quality inquiry test set, ensuring all generated data underwent expert verification by clinical faculty. The standard inquiry test set was constructed to assess response consistency under common conditions, where inquiries were generated via a few-shot learning paradigm. In the generation process, standard dialogue scripts manually authored by experienced medical teachers served as prompts to guide GPT-4 (OpenAI) in generating standardized inquiries derived from clinical case reports. In contrast, to evaluate system robustness against novice errors, we developed the low-quality inquiry test set using a zero-shot learning approach. Drawing on prior research [<xref ref-type="bibr" rid="ref35">35</xref>-<xref ref-type="bibr" rid="ref37">37</xref>], we identified 5 prevalent and empirically testable error types, such as vague inquiries and terminology stacking, as detailed in <xref ref-type="table" rid="table1">Table 1</xref>. 
GPT-4 was prompted to generate flawed inquiry samples based strictly on these error definitions across 3 randomly selected dental cases, producing 3 distinct examples per category to ensure comprehensive coverage and validity.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Example of low-quality inquiry.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Type</td><td align="left" valign="bottom">Description</td><td align="left" valign="bottom">Example</td></tr></thead><tbody><tr><td align="left" valign="top">Vague and ineffective</td><td align="left" valign="top">Characterized by unfocused or overly broad questioning that fails to elicit specific, clinically relevant details</td><td align="left" valign="top">When you had scaling done at the outside hospital four years ago, did the doctor use an ultrasonic scaler or manual curettage?</td></tr><tr><td align="left" valign="top">Terminology stacking</td><td align="left" valign="top">The excessive use of complex medical jargon without adequate lay explanation</td><td align="left" valign="top">Has this tooth previously undergone root canal filling or pulp mummification therapy?</td></tr><tr><td align="left" valign="top">Rigid template application</td><td align="left" valign="top">A mechanical adherence to standard history-taking protocols, often resulting in irrelevant or redundant lines of questioning</td><td align="left" valign="top">I see your medical record notes atrial fibrillation. So, did you suffer from rheumatic fever as a child? Is there a family history of cardiac issues?</td></tr><tr><td align="left" valign="top">Leading questions</td><td align="left" valign="top">The phrasing of inquiries in a manner that suggests a specific answer or introduces bias</td><td align="left" valign="top">When you were in severe pain two days ago, drinking cold water made it hurt, right? 
This is important, so think carefully&#x2014;it must have been painful, correct?</td></tr><tr><td align="left" valign="top">Lack of humanistic care</td><td align="left" valign="top">Neglecting the patient&#x2019;s emotional state, anxiety, or need for empathy, resulting in insufficient doctor-patient rapport</td><td align="left" valign="top">Oh my, your face is swollen asymmetrically. It looks crooked... actually, it looks quite frightening.</td></tr></tbody></table></table-wrap></sec><sec id="s2-6"><title>Evaluation</title><p>Our evaluation was conducted in 2 phases. In the first phase, we systematically evaluated 3 representative LLMs: Qwen3-32B (Alibaba Cloud) [<xref ref-type="bibr" rid="ref38">38</xref>], DeepSeek-V3 (DeepSeek-AI) [<xref ref-type="bibr" rid="ref39">39</xref>], and GPT-4o (OpenAI) [<xref ref-type="bibr" rid="ref40">40</xref>], as potential foundational models for our multiagent framework. Notably, the reasoning mode of Qwen3-32B was disabled for both the multiagent framework and the single-LLM baseline. This setting was constrained by the latency of real-time interactions. Preliminary tests indicated that enabling the reasoning mode resulted in excessive response latency averaging more than 18 seconds, which is unacceptable for practical application.</p><p>We evaluated the dialogue transcripts obtained by applying the inquiry test set to the VPs, constructed from 5 specialized clinical case reports, including internal medicine, surgery, gynecology, pediatrics, and dentistry. A comprehensive performance evaluation was conducted using the metrics detailed in <xref ref-type="table" rid="table2">Table 2</xref>. First, we assessed factual consistency using text similarity metrics evaluated on cases from 5 medical specialties based on the standard inquiry test set. We adopted a human-machine collaboration approach to systematically score other metrics. 
Then, 5 attending physicians from Peking University School and Hospital of Stomatology were invited to conduct evaluations using a rating scale. Specifically, accuracy rate, misleading rate, and perceived educational utility were assessed using the standard inquiry test set, while the role-playing ability was evaluated using both the standard and low-quality inquiry test sets. Physicians assessed the VPs constructed based on dental cases. To ensure objectivity, a blinded evaluation was enforced in which experts were blinded to the underlying model architecture, and model outputs were presented in a randomized order to eliminate sequence bias. Furthermore, all evaluations were conducted anonymously to preserve the integrity of the assessment process. Meanwhile, we compared these evaluations with the results generated by GPT-4, a widely used LLM-as-judge [<xref ref-type="bibr" rid="ref41">41</xref>], to verify the reliability of LLM-based assessments. Subsequently, we expanded the cases to 4 other specialties, conducted assessments using the LLM-as-judge method, and analyzed the scalability of our framework. Note that establishing comprehensive scalability requires further validation through assessments by experts from different departments. 
Finally, we evaluated the interaction efficiency and multiturn dialogue performance to observe the VP&#x2019;s performance during realistic multiround interactions.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Evaluation dimension.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Evaluation dimension</td><td align="left" valign="bottom">Evaluation by</td><td align="left" valign="bottom">Description of value</td><td align="left" valign="bottom">Metrics</td></tr></thead><tbody><tr><td align="left" valign="top">Factual consistency</td><td align="left" valign="top">Researchers</td><td align="left" valign="top">Complementary measures to validate the delivery of essential educational information</td><td align="left" valign="top">Similarity metrics</td></tr><tr><td align="left" valign="top">Accuracy</td><td align="left" valign="top">Medical experts</td><td align="left" valign="top">Complementary measures to validate the delivery of essential educational information</td><td align="left" valign="top">Average score of scale</td></tr><tr><td align="left" valign="top">Misleading rate</td><td align="left" valign="top">Medical experts</td><td align="left" valign="top">Detect personality logical inconsistencies</td><td align="left" valign="top">Average score of scale</td></tr><tr><td align="left" valign="top">Scalability</td><td align="left" valign="top">Researchers and LLM<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">Evaluate the scalability of using cases from different departments</td><td align="left" valign="top">Factual consistency and role-playing ability score</td></tr><tr><td align="left" valign="top">Role-playing ability</td><td align="left" valign="top">Medical experts and LLM</td><td align="left" valign="top">Verify the authenticity of the patient persona</td><td align="left" valign="top">Role-playing score evaluated by standard 
inquiries and scoring of responses to low-quality inquiries</td></tr><tr><td align="left" valign="top">Perceived educational utility</td><td align="left" valign="top">Medical experts</td><td align="left" valign="top">Assess the VP<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup>&#x2019;s perceived usability</td><td align="left" valign="top">Average score of scale</td></tr><tr><td align="left" valign="top">Interaction efficiency</td><td align="left" valign="top">Researchers</td><td align="left" valign="top">Characterized the practical usability and output stability of the multiagent system</td><td align="left" valign="top">Average response time and average token counts</td></tr><tr><td align="left" valign="top">Multiturn dialogue performance</td><td align="left" valign="top">Researchers</td><td align="left" valign="top">Characterized the practical usability and output stability of the multiagent system</td><td align="left" valign="top">Average dialogue turns, response length, and information accumulation curve</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table2fn2"><p><sup>b</sup>VP: virtual patient. </p></fn></table-wrap-foot></table-wrap><p>To further investigate the performance of our multiagent framework in SP simulation, we conducted a second-phase experiment. This subsequent evaluation selected the best-performing LLM for our framework from the first phase and compared our multiagent framework against a single-LLM baseline using identical assessment metrics. The single-LLM baseline used identical instructor requirements and user inputs as the multiagent framework, including teacher requirements for patient profile and natural language query, with detailed prompts provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. For each session, it retrieved the corresponding disease case report to formulate its responses. 
The controlled comparison specifically examined the enhancement effects of the multiagent framework on LLM-based role-playing ability. Detailed descriptions of each evaluation metric are provided below.</p><p>Factual consistency is defined as the faithfulness of the disease information in the response to the original case report details. We assessed factual consistency by segmenting medical records into minimal fact units and measuring their semantic similarity with VP responses to the standard inquiry test set using the bge-small-zh-v1.5 model (Beijing Academy of Artificial Intelligence) [<xref ref-type="bibr" rid="ref42">42</xref>]. Case scalability was then analyzed using statistical approaches, including the coefficient of variation (CV). This assessment verifies whether response contents are substantiated by evidence from medical records, thereby preventing hallucinations or inaccuracies.</p><p>Accuracy rate was assessed by measuring the framework&#x2019;s ability to generate clinically correct responses based on clinical case reports. Experts primarily assessed whether the responses to the standard inquiry test set contained logical errors or clinical inconsistencies and performed binary evaluations to calculate the accuracy rate as the percentage of correct responses relative to the total number of dialogues. Misleading rates were similarly determined through binary expert scoring, defined as the proportion of medically inaccurate responses based on information not derived from clinical case reports in the total dialogue set. The SP simulations incorporated both case-specific medical information and artificially generated patient characteristics, including occupation, age, and lifestyle factors. 
These additional features enhanced simulation realism but may compromise the correctness of the responses, potentially leading to misleading information.</p><p>The assessment of role-playing ability and perceived educational utility, based on the standard inquiry test set, used a modified 5-point Likert scale [<xref ref-type="bibr" rid="ref43">43</xref>], covering 8 performance dimensions as presented in <xref ref-type="table" rid="table3">Table 3</xref>. Perceived educational utility was evaluated by physicians based on the suitability of dialogues for direct use as SP responses in clinical teaching. A machine-expert evaluation approach was adopted: physicians independently assessed a subset of dialogues, providing Likert scores for role-playing ability and perceived educational utility, while GPT-4 evaluated role-playing ability using the same scale criteria. Simultaneously, we used the low-quality inquiry test set to interact with the VP and generated corresponding dialogue logs as an exploratory analysis. As the standardized 5-point Likert scale was not fully suitable for evaluating low-quality responses, 5 physicians were invited to conduct a binary assessment of the VP&#x2019;s responses. The primary evaluation criterion was consistency with the assigned persona. 
Specifically, physicians judged whether the VP exhibited appropriate emotional and behavioral reactions, such as expressing skepticism, confusion, or rebuttal, when confronted with these low-quality inquiries, rather than simply providing compliant answers.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Dimensions and criteria for role-playing ability evaluation of a 5-point Likert scale<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Competency dimension and performance metrics</td><td align="left" valign="bottom">Fulfillment criteria</td></tr></thead><tbody><tr><td align="left" valign="top">The appeal of role-playing</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Anthropomorphism degree</td><td align="left" valign="top">Respond naturally and express reasonable human emotions and personality.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Diversity of expression</td><td align="left" valign="top">Dialogue behaviors and utterances are rich and diverse, avoiding repetitive expressions.</td></tr><tr><td align="left" valign="top">Role consistency</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Knowledge exposure</td><td align="left" valign="top">The response appropriately reflects the character&#x2019;s background traits.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Knowledge hallucination</td><td align="left" valign="top">There is no fabrication of information unknown to the character and no violation of the character&#x2019;s 
settings.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Stylistic consistency</td><td align="left" valign="top">The speaking style, wording habits, and tone descriptions conform to the patient&#x2019;s personality and characteristics.</td></tr><tr><td align="left" valign="top">Conversation ability</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Fluency</td><td align="left" valign="top">The response is grammatically correct, and the expression is smooth.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Relevance</td><td align="left" valign="top">The response closely adheres to the conversation topic without deviation.</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Logical consistency</td><td align="left" valign="top">In multiturn conversations, the responses are logically consistent and free of contradictions.</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>The evaluation architecture incorporates 3 categorical elements: competency dimensions, performance metrics, and fulfillment criteria, collectively designed to quantify the system&#x2019;s clinical interaction fidelity.</p></fn></table-wrap-foot></table-wrap><p>As for interaction efficiency and multiturn performance of the multiagent VP, practical efficiency was assessed by measuring average response time and token counts on a standard question set per dialogue turn. For multiturn performance, an autonomous physician agent was developed to simulate full consultations, guided by standard medical protocols and terminating upon complete information retrieval. 
Ground truth clinical information points in clinical case reports were annotated by 3 physicians, while GPT-4 served as an evaluator to identify these points in the dialogue history. An information accumulation curve was derived by tracking, for each turn, the cumulative percentage of disclosed information, defined as the ratio of cumulative information points identified to the total points in the ground truth. Metrics, including average dialogue turns, response length, and the accumulation curve, were analyzed to ensure the trajectory of clinical information disclosure aligned with the intended use requirements.</p></sec><sec id="s2-7"><title>Statistical Analysis</title><p>Statistical analyses were performed using Python (version 3.10; SciPy library). Continuous data were expressed as means (SD). Interrater reliability among experts was evaluated using the Gwet AC1 statistic. Given the matched-sample nature of the evaluations, overall performance differences across the 4 models for metrics, including accuracy, misleading rates, factual consistency, and role-playing ability, were assessed using the Friedman test. Subsequent post hoc pairwise comparisons were conducted using the 2-sided Wilcoxon signed-rank test. To rigorously control for multiple testing, the Benjamini-Hochberg false discovery rate procedure was applied to all pairwise <italic>P</italic> values, with an adjusted <italic>P</italic>&#x003C;.05 considered statistically significant.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Clinical Case Reports Repository Construction Results</title><p>Our MAS dynamically processes raw clinical case reports through the memory management agent, enabling real-time structuring without complex preprocessing. This design preserves original medical narratives while generating structured representations during operation, maintaining flexibility for simulating diverse clinical scenarios. 
The content of the original case report after segmentation and structuring by the framework is presented in <xref ref-type="table" rid="table4">Table 4</xref>, which illustrates the result of preserving information in accordance with minimal information units, in order to reduce interference between different pieces of information.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Dictionary of clinical case reports after agent-based structured processing.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Module<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup> and unit<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup></td><td align="left" valign="bottom">Fact unit<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">Chief complaint</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">Pain in the right posterior tooth and facial swelling for 4 d</td></tr><tr><td align="left" valign="top">Present illness</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">Spontaneous pain in the right posterior tooth during biting and gingival swelling pain for 4 d, with gradual pain relief but persistent facial swelling</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">No cold or heat sensitivity, night pain, or gingival pus discharge</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">Self-medicated with 
&#x201C;metronidazole and ibuprofen&#x201D; 3 d ago with some pain relief</td></tr><tr><td align="left" valign="top">Family history</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">No significant history</td></tr><tr><td align="left" valign="top">Physical examination</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">Atrial fibrillation and no recent checkups</td></tr><tr><td align="left" valign="top">Clinical findings</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">Right facial swelling with slightly elevated skin temperature and normal skin color</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">46MO<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup>: extensive caries, percussion pain (++), grade I mobility, and swollen buccal gingival sulcus without fluctuation</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">X-ray shows: crown radiolucency reaching pulp chamber, no root canal filling, and periapical radiolucency</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>4</td><td align="left" valign="top">48: impacted tooth</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>5</td><td align="left" valign="top">Poor oral hygiene with calculus ++, 
PD<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup>: 4&#x2010;6 mm, detectable AL<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup></td></tr><tr><td align="left" valign="top">Diagnosis</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">46: acute exacerbation of chronic periapical periodontitis</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">48: impacted tooth</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">Maxillary dentition defect</td></tr><tr><td align="left" valign="top">Management</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>1</td><td align="left" valign="top">Loxoprofen sodium tablets 60 mg*36&#x00D7;1 box. Dosage: 60 mg PRN<sup><xref ref-type="table-fn" rid="table4fn7">g</xref></sup> orally</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>2</td><td align="left" valign="top">Tinidazole tablets 0.5 g*8&#x00D7;2 boxes. Dosage: 0.50 g bid orally</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>3</td><td align="left" valign="top">Amoxicillin capsules 0.25 g*24&#x00D7;1 box. 
Dosage: 0.50 g tid orally</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>Module refers to the designated segments into which a clinical case report is partitioned.</p></fn><fn id="table4fn2"><p><sup>b</sup>Unit refers to the number of textual units into which the case information of each module is partitioned according to predefined rules. For example, the &#x201C;present illness&#x201D; section may be segmented into 4 discrete units of clinical information.</p></fn><fn id="table4fn3"><p><sup>c</sup>Fact unit refers to the content obtained from the segmentation of specific case reports.</p></fn><fn id="table4fn4"><p><sup>d</sup>46MO: 46 mesio-occlusal.</p></fn><fn id="table4fn5"><p><sup>e</sup>PD: pocket depth.</p></fn><fn id="table4fn6"><p><sup>f</sup>AL: attachment loss.</p></fn><fn id="table4fn7"><p><sup>g</sup>PRN: pro re nata.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Evaluation Results</title><sec id="s3-2-1"><title>Evaluation Situation</title><p>Our output dataset comprised 650 physician-patient dialogues across 5 major medical specialties covering internal medicine, surgery, gynecology, pediatrics, and dentistry. Specifically, the expert evaluation covered 150 dialogue records of dental cases. The evaluation yielded several key findings, demonstrating the strong performance of the multiagent framework in simulating SPs. These results are presented across several main areas: factual consistency, accuracy and misleading rates, role-playing ability, and perceived educational utility.</p></sec><sec id="s3-2-2"><title>Factual Consistency</title><p>The Friedman test revealed a significant overall difference in factual consistency across methods (n=10; <italic>&#x03C7;</italic>&#x00B2;<sub>3</sub>=13.0; <italic>P</italic>=.04). 
As <xref ref-type="fig" rid="figure6">Figure 6</xref> illustrates, the GPT-4o&#x2013;based approach performed best, leading across most departments with the highest mean score of 0.769 (SD 0.04), and significantly outperforming the single-LLM baseline (adjusted <italic>P</italic>=.03). While the Qwen3-32B multiagent framework achieved a higher mean factual consistency score of 0.734 (SD 0.06) compared to the single-LLM Qwen3-32B approach (mean 0.699, SD 0.07), this numerical improvement did not reach statistical significance (adjusted <italic>P</italic>=.33). Furthermore, DeepSeek-V3&#x2013;based performance was similar to the Qwen3-32B&#x2013;based framework. Across all methods, performance was consistently best in surgery and worst in pediatrics.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Factual consistency scores and mean values of multiagent framework with different base models. The dotted lines indicate the average (avg) score for each method.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig06.png"/></fig></sec><sec id="s3-2-3"><title>Accuracy and Misleading Rate</title><p>This study evaluated the accuracy of clinical information within dialogues generated from dental case reports. All methods demonstrated high proficiency, with accuracy rates exceeding 94%. Statistical analysis revealed a significant overall performance variance among the models (<italic>&#x03C7;</italic>&#x00B2;<sub>3</sub>=12.0; <italic>P</italic>=.007; <xref ref-type="table" rid="table5">Table 5</xref>). The DeepSeek-V3&#x2013;based framework and the single-LLM baseline emerged as the top-performing approaches (mean 97.44, SD 1.24). Conversely, the GPT-4o&#x2013;based framework recorded the lowest relative accuracy and was significantly outperformed by both the single-LLM baseline (adjusted <italic>P</italic>=.03) and the DeepSeek-V3 framework (adjusted <italic>P</italic>=.03). 
Notably, despite achieving the highest mean accuracy, the single-LLM baseline did not demonstrate a statistically significant advantage over the Qwen3-32B model (adjusted <italic>P</italic>=.17). This comparable performance indicates that the multiagent framework did not yield a significant difference in accuracy over the single-LLM approach.</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Accuracy and misleading rates of different methods (N=8)<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup>.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Method</td><td align="left" valign="bottom">Accuracy (%), mean (SD)</td><td align="left" valign="bottom">Misleading (%), mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Single-LLM-Qwen3-32B</td><td align="left" valign="top">97.44 (1.24)<sup>A</sup></td><td align="left" valign="top">4.72 (1.53)<sup>A</sup></td></tr><tr><td align="left" valign="top">DeepSeek-V3&#x2013;based</td><td align="left" valign="top">97.44 (1.24)<sup>A</sup></td><td align="left" valign="top">1.39 (0.87)<sup>B</sup></td></tr><tr><td align="left" valign="top">Qwen3-32B&#x2013;based</td><td align="left" valign="top">96.15 (2.05)<sup>A,B</sup></td><td align="left" valign="top">1.28 (1.20)<sup>B</sup></td></tr><tr><td align="left" valign="top">GPT-4o&#x2013;based</td><td align="left" valign="top">94.87 (1.82)<sup>B</sup></td><td align="left" valign="top">2.56 (1.01)<sup>A,B</sup></td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>Interrater reliability analysis showed high consensus (Gwet AC1: 0.93 for accuracy and 0.96 for misleading rate).</p></fn><fn id="table5fn2"><p><sup>b</sup>Superscript letters (A and B) across rows denote significant pairwise differences (adjusted <italic>P</italic>&#x003C;.05). 
Models sharing the same letter do not differ significantly.</p></fn></table-wrap-foot></table-wrap><p>For noncase report&#x2013;based information, all methods demonstrated misleading rates under 5%, again showing a significant overall difference (<italic>&#x03C7;</italic>&#x00B2;<sub>3</sub>=15.73; <italic>P</italic>=.001; <xref ref-type="table" rid="table5">Table 5</xref>). The single-LLM Qwen3-32B baseline showed significantly higher misleading rates than the multiagent frameworks based on Qwen3-32B (adjusted <italic>P</italic>=.003) and DeepSeek-V3 (adjusted <italic>P</italic>=.004). Differences between other models regarding misleading rates were not significant (adjusted <italic>P</italic>&#x2265;.11). The multiagent framework may reduce hallucinations by retrieving only case-relevant information.</p></sec><sec id="s3-2-4"><title>Role-Playing Ability and Perceived Educational Utility Under Standard Conditions</title><p>For the role-playing competency assessment based on a standard inquiry test set, experts scored 8 dental case dialogues using an 8-item scale across 3 dimensions. The detailed scores for each dimension are presented in <xref ref-type="table" rid="table6">Table 6</xref>. Overall, a statistically significant difference in total role-playing competency scores among the 4 models was observed (<italic>&#x03C7;</italic>&#x00B2;<sub>3</sub>=16.6; <italic>P</italic>&#x003C;.001). The results demonstrated that the Qwen3-32B&#x2013;based framework achieved a significantly higher total score (mean 39.67, SD 0.71) compared to the GPT-4o&#x2013;based framework (<italic>P</italic>=.008) and the single-LLM baseline (<italic>P</italic>=.04). Differences between DeepSeek-V3&#x2013;based and either Qwen3-32B&#x2013;based (<italic>P</italic>=.31) or the single-LLM Qwen3-32B (<italic>P</italic>=.48) were not statistically significant. 
A detailed analysis of score distributions across the 3 evaluation dimensions was conducted in <xref ref-type="fig" rid="figure7">Figure 7</xref>. Regarding role attractiveness, the role-playing degree and expressive diversity exhibited by the VPs were the primary focus of assessment. Examples of VP responses from different methods are presented in <xref ref-type="table" rid="table7">Table 7</xref>. The results indicated that Qwen3-32B&#x2013;based agents and DeepSeek-V3&#x2013;based agents achieved 98.3% (9.83/10) of the maximum score for the appeal of role-playing, demonstrating diverse human-like characteristics, natural interaction, and appropriate emotion. The GPT-4o&#x2013;based system scored 88.3% (SD 0.07; 8.83/10), exhibiting a more direct, rational style with occasional neglect of patient personality. The single-LLM baseline obtained a score between these top-performing models and the GPT-4o&#x2013;based system.</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Detailed scores for the 8 consistency issues in the dental cases, as evaluated by physicians using the standardized scale (N=8)<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup>.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Methods</td><td align="left" valign="bottom" colspan="2">The appeal of role-playing, mean</td><td align="left" valign="bottom" colspan="3">Role consistency, mean</td><td align="left" valign="bottom" colspan="3">Conversation ability, mean</td><td align="left" valign="bottom">Total score</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">AD<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup> (Krippendorff &#x03B1;=0.67)</td><td align="left" valign="bottom">DE<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup> (Krippendorff &#x03B1;=0.66)</td><td align="left" valign="bottom">KE<sup><xref ref-type="table-fn" rid="table6fn4">d</xref></sup> 
(Krippendorff &#x03B1;=0.72)</td><td align="left" valign="bottom">KH<sup><xref ref-type="table-fn" rid="table6fn5">e</xref></sup> (Krippendorff &#x03B1;=0.77)</td><td align="left" valign="bottom">SC<sup><xref ref-type="table-fn" rid="table6fn6">f</xref></sup> (Krippendorff &#x03B1;=0.73)</td><td align="left" valign="bottom">Flu.<sup><xref ref-type="table-fn" rid="table6fn7">g</xref></sup> (Krippendorff &#x03B1;=0.87)</td><td align="left" valign="bottom">Rel.<sup><xref ref-type="table-fn" rid="table6fn8">h</xref></sup> (Krippendorff &#x03B1;=0.87)</td><td align="left" valign="bottom">LS<sup><xref ref-type="table-fn" rid="table6fn9">i</xref></sup> (Krippendorff &#x03B1;=0.87<sup><xref ref-type="table-fn" rid="table6fn10">j</xref></sup>)</td><td align="left" valign="bottom"/></tr></thead><tbody><tr><td align="left" valign="top">GPT-4o&#x2013;based</td><td align="left" valign="top">4.33</td><td align="left" valign="top">4.50</td><td align="left" valign="top">4.50</td><td align="left" valign="top">4.50</td><td align="left" valign="top">4.67</td><td align="left" valign="top">4.83</td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">4.67</td><td align="left" valign="top">37.33<sup>C</sup></td></tr><tr><td align="left" valign="top">Single-LLM-Qwen3-32B</td><td align="left" valign="top">4.67</td><td align="left" valign="top">4.50</td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">4.67</td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" 
rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">38.83<sup>B</sup></td></tr><tr><td align="left" valign="top">Qwen3-32B&#x2013;based</td><td align="left" valign="top">4.83</td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">4.83<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup></td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">39.67<sup>A,<xref ref-type="table-fn" rid="table6fn11">k</xref>,<xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td></tr><tr><td align="left" valign="top">DeepSeek-V3&#x2013;based</td><td 
align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">4.83</td><td align="left" valign="top">4.67</td><td align="left" valign="top">4.83<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup></td><td align="left" valign="top">4.83</td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">5.00<sup><xref ref-type="table-fn" rid="table6fn11">k</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table6fn12">l</xref></sup></td><td align="left" valign="top">39.17<sup>A,B</sup></td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>Superscript letters (A, B, and C) alongside the total scores denote significant pairwise differences (adjusted <italic>P</italic>&#x003C;.05). Models sharing the same letter do not differ significantly.</p></fn><fn id="table6fn2"><p><sup>b</sup>AD: anthropomorphism degree. </p></fn><fn id="table6fn3"><p><sup>c</sup>DE: diversity of expression. </p></fn><fn id="table6fn4"><p><sup>d</sup>KE: knowledge exposure. </p></fn><fn id="table6fn5"><p><sup>e</sup>KH: knowledge hallucination. </p></fn><fn id="table6fn6"><p><sup>f</sup>SC: stylistic consistency. </p></fn><fn id="table6fn7"><p><sup>g</sup>Flu.: fluency. </p></fn><fn id="table6fn8"><p><sup>h</sup>Rel.: relevance. </p></fn><fn id="table6fn9"><p><sup>i</sup>LS: logical consistency.</p></fn><fn id="table6fn10"><p><sup>j</sup>Interrater reliability indicating strong expert consensus.</p></fn><fn id="table6fn11"><p><sup>k</sup>Optimal values for each indicator. 
</p></fn><fn id="table6fn12"><p><sup>l</sup>Optimal values</p></fn></table-wrap-foot></table-wrap><fig position="float" id="figure7"><label>Figure 7.</label><caption><p>A detailed analysis of score distributions across the 3 evaluation dimensions evaluated by physicians.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig07.png"/></fig><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Example comparison results of dialogue transcripts between multiagent frameworks with different base models and single-large language model approaches.</p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Method</td><td align="left" valign="bottom">Communication record<break/>Example: Is there heavy bleeding from your gums? Does it stop by itself?</td></tr></thead><tbody><tr><td align="left" valign="top">GPT-4o&#x2013;based</td><td align="left" valign="top">The bleeding is not heavy and usually stops on its own.</td></tr><tr><td align="left" valign="top">Qwen3-32B&#x2013;based</td><td align="left" valign="top">The bleeding isn&#x2019;t heavy, just some blood streaks when brushing, and occasionally a little bleeding even with light contact, but it stops on its own and resolves quickly. I&#x2019;m a bit concerned, could this be some kind of inflammation?</td></tr><tr><td align="left" valign="top">Single-LLM-Qwen3-32B</td><td align="left" valign="top">It&#x2019;s not much bleeding, just a little bit. Sometimes it looks slightly red right after brushing, but it stops on its own after a while.</td></tr><tr><td align="left" valign="top">DeepSeek-V3&#x2013;based</td><td align="left" valign="top">The bleeding isn&#x2019;t heavy, just some pink streaks on my toothbrush that go away after rinsing a couple times. But it keeps oozing like this, and I&#x2019;m worried. 
Could there be a risk it might suddenly stop clotting one day?</td></tr></tbody></table></table-wrap><p>Regarding the role consistency on the standard inquiry test set, Qwen3-32B&#x2013;based agents achieved 98.9% (SD 0.02; 14.83/15), effectively portraying character settings. In contrast, GPT-4o&#x2013;based agents scored 91.1% (SD 0.01; 13.67/15), exhibiting fabrication or neglect of character details and limited background representation. Regarding conversational competence, all methods achieved scores exceeding 96.7% (SD 0.02; 14.5/15), closely adhering to the defined criteria of grammatical accuracy, fluency of expression, logical consistency across multiple turns, thematic relevance, and absence of deviation. This suggests that current LLMs can effectively simulate SP dialogues.</p><p>Meanwhile, comparative analysis revealed that the LLM-as-judge scores, assessed by GPT-4 using standardized rating scales, were systematically lower than expert ratings. The overall scoring trends demonstrated substantial alignment between both evaluation methods, as shown in <xref ref-type="fig" rid="figure8">Figure 8</xref>. <xref ref-type="fig" rid="figure9">Figure 9</xref> presents the LLM-as-judge evaluation results across all 5 clinical specialties on average. Both Qwen3-32B and DeepSeek-V3 achieved the highest scores in 2 distinct dimensions, consistent with trends observed in leading performance rankings.</p><fig position="float" id="figure8"><label>Figure 8.</label><caption><p>Total role-playing scores of the virtual patient as rated by physicians (dental), LLM-as-judge (dental), and the average scores of LLM-as-judge across 5 clinical specialties. 
LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig08.png"/></fig><fig position="float" id="figure9"><label>Figure 9.</label><caption><p>Role-playing competency scores of virtual patients across 5 specialties during simulated dialogues evaluated by GPT-4.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig09.png"/></fig><p>In terms of perceived educational utility, the multiagent frameworks based on DeepSeek-V3 and Qwen3-32B received full scores from physicians under standard inquiry conditions, as illustrated in <xref ref-type="table" rid="table8">Table 8</xref>. This indicates their applicability for physician-patient communication training within the medical education process.</p><table-wrap id="t8" position="float"><label>Table 8.</label><caption><p>Perceived educational utility score for the standard inquiry test set assessing the acceptability level of applying the virtual patient in real physician-patient communication training<sup><xref ref-type="table-fn" rid="table8fn1">a</xref></sup>.</p></caption><table id="table8" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Methods</td><td align="left" valign="bottom">Score, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">GPT-4o&#x2013;based</td><td align="left" valign="top">4.50 (0.71)</td></tr><tr><td align="left" valign="top">Single-LLM-Qwen3-32B</td><td align="left" valign="top">4.83 (0.24)</td></tr><tr><td align="left" valign="top">Qwen3-32B&#x2013;based</td><td align="left" valign="top">5.00 (0.00)</td></tr><tr><td align="left" valign="top">DeepSeek-V3&#x2013;based</td><td align="left" valign="top">5.00 (0.00)</td></tr></tbody></table><table-wrap-foot><fn id="table8fn1"><p><sup>a</sup>The interrater reliability (Krippendorff &#x03B1;) for this metric was 0.75, indicating strong expert 
consensus. Statistical significance testing was not performed, as it is intended to provide a holistic perspective from clinicians on the overall educational value of the system.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2-5"><title>Exploratory Analysis of Role-Playing Ability Under Low-Quality Inquiry</title><p>As for the exploratory analysis of the low-quality inquiry test set, our multiagent framework demonstrated limited advantages in specific scenarios, effectively identifying low-quality medical inquiries, such as terminology stacking and rigid application, and responding according to predefined role configurations, as shown in <xref ref-type="fig" rid="figure10">Figure 10</xref>. However, the framework exhibits insufficient discriminative power in handling other types of low-quality inquiries. Specifically, regarding the lack of humanistic care, scores remained consistently high across different methods without significant differences. In contrast, performance was notably inferior when addressing leading questions, in which the model demonstrated a tendency toward overreaction.</p><fig position="float" id="figure10"><label>Figure 10.</label><caption><p>Performance scores of virtual patients in response to low-quality medical inquiries. Interrater reliability (Gwet AC1) for each dimension is as follows: dimension VI (AC1=0.66), dimension Ts (AC1=0.84), dimension Ra (AC1=0.62), dimension Lq (AC1=0.63), and dimension LHC (AC1=0.96), confirming great agreement among experts on these binary metrics.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig10.png"/></fig></sec><sec id="s3-2-6"><title>Scalability</title><p>We preliminarily evaluated the scalability of our framework by assessing its factual consistency and role-playing performance, using LLM-as-judge evaluations as a preliminary exploratory approach. 
The line graph in <xref ref-type="fig" rid="figure11">Figure 11</xref> illustrates the performance variations of each method across case reports from different clinical departments. Trend analysis reveals that, with the exception of the GPT-4o&#x2013;based framework, the case reports from different departments had a consistent impact on the performance trends of the other methods. The factual consistency scores for pediatrics were notably lower than those of the other 4 departments, indicating that case reports from this specialty may have a substantial impact on the VP&#x2019;s responses. The other 4 departments exhibited similar scoring ranges, with our method demonstrating stable fluctuations within a high-scoring interval (0.7&#x2010;0.85), suggesting good generalizability and broad applicability across diverse case reports.</p><fig position="float" id="figure11"><label>Figure 11.</label><caption><p>Corresponding score distribution curves of factual consistency results for multiagent frameworks with different specialty case reports and different base models.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig11.png"/></fig><p>Analysis of the CV revealed that the GPT-4o&#x2013;based framework (CV=4.7%) exhibited the highest stability, while the single-LLM Qwen3-32B framework (CV=8.7%) showed slightly greater variability. As illustrated in <xref ref-type="table" rid="table9">Table 9</xref>, the overall model performance was minimally affected by clinical departments. 
However, factual consistency scores for the single-LLM-Qwen3-32B framework varied more substantially across departments, indicating lower scalability compared to the multiagent framework.</p><table-wrap id="t9" position="float"><label>Table 9.</label><caption><p>Comparison of the coefficient of variation (CV) across different methods.</p></caption><table id="table9" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Methods</td><td align="left" valign="bottom">CV, %</td></tr></thead><tbody><tr><td align="left" valign="top">GPT-4o&#x2013;based</td><td align="left" valign="top">4.7</td></tr><tr><td align="left" valign="top">Qwen3-32B&#x2013;based</td><td align="left" valign="top">7.1</td></tr><tr><td align="left" valign="top">Single-LLM-Qwen3-32B</td><td align="left" valign="top">8.7</td></tr><tr><td align="left" valign="top">DeepSeek-V3&#x2013;based</td><td align="left" valign="top">5.7</td></tr></tbody></table></table-wrap><p>The LLM-as-judge evaluation across different specialty-specific VP dialogues revealed that our framework demonstrated stable cross-specialty performance, with limited score variations in the appeal of role-playing (<italic>&#x0394;</italic>&#x003C;2), in contrast to the greater variability of the single-LLM baseline, as shown in <xref ref-type="fig" rid="figure12">Figure 12</xref>. The DeepSeek-V3&#x2013;based framework showed minimal fluctuations in role attractiveness (<italic>&#x0394;</italic>=1). 
These findings suggest that while our framework enhances scalability in the appeal of role-playing in certain clinical departments, the performance of the base model should be considered as a critical factor during deployment.</p><fig position="float" id="figure12"><label>Figure 12.</label><caption><p>Performance scores of role-playing competencies across specialties based on GPT-4 automated evaluation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig12.png"/></fig></sec><sec id="s3-2-7"><title>Interaction Efficiency and Multiturn Dialogue Performance</title><p>Regarding interaction efficiency, the Qwen3-32B&#x2013;based multiagent VP achieved the shortest response time among the 3 backbone models, although it was slightly longer than that of the single-LLM Qwen3-32B. For the Qwen3-32B&#x2013;based VP, the average latency of approximately 3 seconds fell within an acceptable range for user interaction, as shown in <xref ref-type="fig" rid="figure13">Figure 13</xref>. In terms of resource usage, the DeepSeek-V3&#x2013;based VP proved the most efficient (mean 1219.64, SD 130.95 tokens), whereas the Qwen3-32B&#x2013;based VP consumed the most. Although the Qwen3-32B&#x2013;based VP used more tokens than the single-LLM baseline, its consumption remained stable across sessions, contrasting with the high variance observed in the single-LLM baseline.</p><fig position="float" id="figure13"><label>Figure 13.</label><caption><p>Interaction efficiency evaluation based on average response time and average token consumption.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig13.png"/></fig><p>For multiturn performance, the Qwen3-32B&#x2013;based VP achieved the highest engagement and average response length, averaging 13 turns according to <xref ref-type="table" rid="table10">Table 10</xref> and <xref ref-type="fig" rid="figure14">Figure 14</xref>. 
Regarding the dynamics of the interaction, the analysis of information accumulation in <xref ref-type="fig" rid="figure15">Figure 15</xref> reveals distinct differences in information pacing. Specifically, the frameworks built on GPT-4o and Qwen3-32B exhibited an information accumulation rate trend characterized by an initial rise, subsequent fall, and final rise. In contrast, the DeepSeek-V3&#x2013;based framework and the single-LLM baseline showed a different trajectory, featuring a rapid initial rise followed by a decline. These diverging patterns highlight variations in information flow during the dialogue. The rapid early accumulation observed in the latter group suggests a more rapid information release strategy, which could indicate either higher efficiency or a tendency toward excessive disclosure in early turns. Ultimately, this indicates that sustainable multiturn dialogue relies on a balanced information release rhythm, and avoiding early information saturation might be more conducive to dialogue stability.</p><table-wrap id="t10" position="float"><label>Table 10.</label><caption><p>Average conversation turns by different methods.</p></caption><table id="table10" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Method</td><td align="left" valign="top">Values, mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">DeepSeek-V3&#x2013;based</td><td align="char" char="." valign="top">6.333 (0.882)</td></tr><tr><td align="left" valign="top">GPT-4o&#x2013;based</td><td align="char" char="." valign="top">11.5 (4.950)</td></tr><tr><td align="left" valign="top">Qwen3-32B&#x2013;based</td><td align="char" char="." valign="top">13 (4.243)</td></tr><tr><td align="left" valign="top">Single-LLM-Qwen3-32B</td><td align="char" char="." 
valign="top">7.5 (1.732)</td></tr></tbody></table></table-wrap><fig position="float" id="figure14"><label>Figure 14.</label><caption><p>Statistical distribution of reply lengths across different methods, illustrating the average length alongside minimum and maximum values.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig14.png"/></fig><fig position="float" id="figure15"><label>Figure 15.</label><caption><p>Information accumulation curve.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig15.png"/></fig></sec></sec><sec id="s3-3"><title>Multiagent Workflow Instance</title><p><xref ref-type="fig" rid="figure16">Figure 16</xref> exemplifies the multiagent workflow for VP construction and response generation. Upon case confirmation by medical educators, agents collaborate to develop a tailored VP based on specified requirements. These agents construct patient-specific information aligning with the case&#x2019;s core features. The example illustrates a middle-aged male patient with clinical anxiety, cautiousness, dependency, and health concern. Concurrently, the information processing agent segments the case report and stores clinical data for real-time retrieval during student interactions.</p><p>When a simulated student inquiry asks, for instance, &#x201C;Where have you been feeling uncomfortable recently?,&#x201D; the VP response framework activates. The system classifies this query as high quality and open ended, which triggers the selection of an optimized prompt to generate a relevant response. Simultaneously, the system uses clinical reasoning to connect the query to the most relevant case modules, such as the patient&#x2019;s chief complaint and present illness. 
Through structured traversal of minimal information units within these modules, the agent precisely identifies and retrieves the most clinically relevant data elements for response generation.</p><p>The response generation agent generates replies based on information input from other subagents. The reflection agent performs final-stage quality control by verifying response compliance with SP protocols. Responses satisfying all clinical and educational standards are directly delivered to users, while noncompliant responses undergo targeted modification to rectify specific deviations before being output.</p><fig position="float" id="figure16"><label>Figure 16.</label><caption><p>An input and output example with the multiagent framework.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e84747_fig16.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our study successfully developed a multiagent framework for SP simulation and evaluated its performance against single-LLM baseline setups. The principal findings reveal that the multiagent framework&#x2019;s structured design and the resulting disparity in prompt depth significantly improved VP simulation quality by enhancing factual consistency, reducing misleading responses, increasing role-playing expressivity, and demonstrating stable performance in terms of interaction efficiency and multiturn dialogues.</p><p>The high accuracy rate underscores the framework&#x2019;s clinical correctness, while factual consistency reflects semantic alignment with case reports. Together, these complementary metrics indicate that the VP responses are both semantically grounded and clinically reliable. While the GPT-4o&#x2013;based framework exhibited the highest factual consistency, it yielded the lowest accuracy. 
This implies a specific weakness in pinpointing targeted consultation information despite generating fluent, context-similar text. In contrast, the Qwen3-32B&#x2013;based and DeepSeek-V3&#x2013;based frameworks achieved peak accuracy scores. Despite yielding a higher, albeit nonsignificant, medical accuracy compared to the Qwen3-32B&#x2013;based framework, the single-LLM baseline exhibited significantly higher misleading rates than both the Qwen3-32B&#x2013;based and DeepSeek-V3&#x2013;based multiagent frameworks. These findings suggest that the absence of decomposed reasoning increases the susceptibility to hallucinated content.</p><p>Our framework reinforces patient characteristics by reasoning deeply about superficial traits, which helps create richer personality profiles and guides the LLM&#x2019;s persona performance, aligning with role-playing agent research on role identity activation [<xref ref-type="bibr" rid="ref44">44</xref>]. Evaluations showed that in role-playing capability and perceived educational utility, the Qwen3-32B framework performed comparably to the DeepSeek-V3 implementation, with both significantly outperforming the GPT-4o&#x2013;based one. It is noteworthy that our multiagent framework achieved a higher score in role-playing than the single-LLM baseline on the standard inquiry test set. This observation can be attributed to the deep role-response reasoning implemented in our subagents. Physicians also strongly endorsed our framework&#x2019;s educational utility with a high score. The inverse relationship between factual consistency and role-playing implies that maximizing similarity to clinical facts is a limited proxy for realistic simulation, as authentic patients rarely speak with such medical precision. 
Thus, factual consistency should serve as a necessary baseline for clinical correctness rather than a target to be maximized in isolation, avoiding rigid alignment that hinders naturalistic expression.</p><p>Regarding the exploratory analysis of the low-quality inquiry test set, specifically for terminology stacking and rigid application, the results indicate that the framework adheres to character constraints more effectively than the single-LLM baseline, yet exhibits limited discriminative behavioral dynamics when challenged by subtle cues such as leading questions. Overall, the results demonstrate that our framework enhances the VP&#x2019;s capability to maintain its persona compared to the single-LLM baseline.</p><p>As for factual consistency across diverse clinical departments, the framework maintained scores within the 0.7 to 0.85 range except in pediatrics, with a lower CV than the single-LLM baseline, suggesting improved stability and potential for rapid construction without complex preprocessing [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref45">45</xref>]. The drop in pediatrics likely resulted from the simulation of child patients, who typically have limited communicative ability. This indicates that role definitions in specific departments require adjustment, such as simulating a guardian to better support realistic medical education. In terms of role-playing ability, preliminary LLM-as-judge evaluations suggested consistent cross-specialty performance, with limited score variations (<italic>&#x0394;</italic>&#x003C;2) observed in the appeal of role-playing. However, despite expert review of the generated test set, the use of GPT-4 for both creating the test set and the subsequent evaluation introduces a risk of self-preference bias or circular validation. 
Therefore, these role-playing metrics serve primarily as an exploratory reference, and definitive verification necessitates further validation by human specialists.</p><p>MASs face significant challenges regarding time and cost efficiency in practical applications. While our multiagent framework based on Qwen3-32B exhibited a higher average response time and higher average token consumption compared with the single-LLM baseline, its performance remained within acceptable limits, demonstrating that the efficiency of our framework was acceptable. Regarding multiturn dialogue, the average number of dialogue turns and the steady growth of accumulated information points across turns reflect the gradual information release pace essential to simulated clinical settings, highlighting the framework&#x2019;s potential for application.</p><p>Overall, our proposed framework demonstrated superior performance compared to the single-LLM baseline across most evaluation metrics, while maintaining an acceptable efficiency profile. Collectively, these findings highlight the importance of optimizing the trade-off between time efficiency and response quality when selecting model configurations for real-world deployment. While this study prioritizes real-time interaction fidelity, we acknowledge that alternative deployment settings may yield different trade-offs between reasoning depth and latency. In particular, a single LLM with reasoning enabled could offer stronger analytical performance in offline or low&#x2013;time-sensitivity educational scenarios, such as post hoc case analysis or self-paced learning. However, our preliminary evaluations indicate that enabling reasoning in large-scale models introduces substantial response latency and variability, which undermines conversational realism in interactive patient simulations. 
Consequently, the current framework adopts multiagent coordination under constrained inference settings as a more practical solution for real-time clinical education, while future work may explore adaptive reasoning strategies that dynamically balance responsiveness and analytical depth across diverse learning contexts.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>To further evaluate the performance of our framework, we compared its evaluation results with those of existing MASs designed for role-playing. Specifically, in terms of factual consistency, we compared our framework with an existing system Mediq [<xref ref-type="bibr" rid="ref46">46</xref>]. Our GPT-4o&#x2013;based framework achieved a score of 82.6% in medical dialogues, which is higher than Mediq&#x2019;s direct (55.9%) and instruct (62.8%) variants but lower than its fact-select approach. The difference may stem from our emphasis on patient personality, which can reduce textual similarity to the original case reports. Our framework also achieved significantly higher average scores in conversational ability and the appeal of role-playing compared to top-performing GPT-4&#x2013;based implementations in the CharacterEval benchmark [<xref ref-type="bibr" rid="ref43">43</xref>]. 
Although role consistency was slightly lower, likely because dialogues did not always require explicit trait expression, expert evaluations confirmed consistency with LLM-as-judge rankings, validating the Qwen3-32B&#x2013;based framework&#x2019;s robust role-playing capabilities, corroborating prior role-playing agent studies on role identity activation [<xref ref-type="bibr" rid="ref44">44</xref>].</p></sec><sec id="s4-3"><title>Educational Implications for Medical Training</title><p>From an educational perspective, the proposed multiagent VP framework offers several practical implications for clinical communication training, exemplifying the pivotal role of generative AI in advancing medical education [<xref ref-type="bibr" rid="ref47">47</xref>,<xref ref-type="bibr" rid="ref48">48</xref>]. First, by maintaining stable patient personas while handling low-quality or incomplete inquiries, which have been noted in [<xref ref-type="bibr" rid="ref49">49</xref>] to trigger hallucinations and response instability, the framework supports repeated, self-directed practice for medical students, particularly in early-stage communication skill acquisition. Second, the controllable and scalable nature of the system allows instructors to rapidly generate diverse SP scenarios without the logistical constraints associated with human SP recruitment and training [<xref ref-type="bibr" rid="ref50">50</xref>]. Third, the observed reduction in misleading responses compared with the single-LLM baseline is especially relevant for formative educational settings, where inaccurate patient feedback may negatively reinforce incorrect clinical reasoning. As Wen et al [<xref ref-type="bibr" rid="ref51">51</xref>] noted, single generative language models tend to entail inherent misleading risks in clinical-related interactions. Such behaviors directly align with the concern that misleading feedback from single LLMs may reinforce wrong clinical reasoning in formative education. 
Collectively, these findings suggest that the multiagent VP framework may serve as a complementary educational tool alongside traditional SP-based training, particularly in resource-limited or large-scale teaching contexts.</p></sec><sec id="s4-4"><title>Limitations</title><p>This study has several limitations. First, our evaluation focused on the overall performance of the multiagent framework rather than the isolated effectiveness of individual subagents. While this design demonstrates the advantages of collaborative agent-based modeling, it does not fully disentangle the specific contributions of each component, such as the memory management agent or the character mapping agent. Importantly, the comparison between the multiagent framework and the single-LLM baseline was not fully matched in terms of prompt depth. As shown in Table 1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, the single-LLM baseline relied primarily on teacher-provided patient settings, whereas the multiagent framework used a dedicated character mapping agent to expand these inputs into a richer personality profile, including behavioral tendencies, response thresholds, and defense mechanisms. This difference introduces a potential confounding factor, as the observed performance gap may reflect not only architectural advantages of multiagent coordination but also differences in prompt engineering depth. Consequently, future research should conduct more controlled ablation studies to isolate the effects of individual subagents and systematically evaluate the impact of prompt complexity, thereby disentangling architectural contributions from prompt design effects.</p><p>Second, although our study conducted a comprehensive evaluation, it did not include a prospective validation in real-world clinical settings. 
The assessment relied primarily on controlled scripts and LLM-as-judge methods, which may not fully capture the complexity of authentic student-patient interactions. While preliminary expert-based evaluation suggested high perceived educational utility, the long-term effectiveness and pedagogical impact of the framework remain uncertain. Further studies should incorporate randomized controlled trials in real teaching environments to rigorously assess whether the proposed system improves clinical communication skills compared with SPs or conventional training approaches.</p><p>Third, in addition to role-playing ability and multiturn dialogue evaluation, we assessed the response quality by incorporating specific natural language metrics, mirroring the essential SP function of conveying patient history and information during clinical dialogues. However, it is crucial to acknowledge the applicability boundary of these metrics: higher factual accuracy does not intrinsically mean better patient simulation performance, as authentic patient behavior inherently involves forgetfulness, logical gaps, and potential fabrication. Future research should explore more comprehensive evaluation frameworks that better reflect real-world communication dynamics.</p><p>Finally, this study relied exclusively on Chinese clinical databases and expert evaluators. This linguistic homogeneity implies that the observed performance may stem from the base model&#x2019;s alignment with specific training data rather than inherent architectural advantages. To ensure the framework&#x2019;s generalizability and linguistic robustness, future validation on English and multilingual datasets is essential.</p></sec><sec id="s4-5"><title>Conclusions</title><p>This study demonstrates that our multiagent framework, leveraging LLMs, provides a feasible method for SP simulation. It facilitates natural and acceptable language-based interactions between VPs and users. 
In the context of medical education, this approach supports a promising pathway toward the development of scalable and effective communication training. Our multiagent framework demonstrated high factual consistency, response accuracy, and role-playing ability, while also demonstrating stable performance in terms of interaction efficiency and multiturn dialogues. This design overcomes the limitations of the single-LLM baseline in medical education role-playing, effectively mitigating hallucinations and significantly reducing the rate of misinformation. Furthermore, our framework addresses limitations in case scenario updates and customization often found in existing multiagent VP systems, exhibiting high flexibility and scalability in certain clinical departments. This design shows promise in providing training-oriented support that complements theoretical coursework. Future studies should further explore its long-term efficacy and broader applicability in authentic medical teaching environments.</p></sec></sec></body><back><ack><p>This study was supported by grants from Zhejiang University, Peking Union Medical College, CAMS, and the Beijing Municipal Health Commission. The views expressed are solely those of the authors and do not represent any institution. The authors declare the use of generative artificial intelligence (GenAI) in the research and writing process. According to the GAIDeT (Generative Artificial Intelligence Delegation Taxonomy 2025), the following tasks were delegated to GenAI tools under full human supervision: code optimization, creation of algorithms for data analysis, summarizing text, and translation. The GenAI tool used was DeepSeek-R1. Responsibility for the final manuscript lies entirely with the authors. 
GenAI tools are not listed as authors and do not bear responsibility for the final outcomes.</p></ack><notes><sec><title>Funding</title><p>The research was supported by Intelligent Early Warning and Diagnosis-Treatment Decision Support System for Major Diseases (2025C01136), CAMS Innovation Fund for Medical Sciences (2025-I2M-KJ-002), Peking Union Medical College, Graduate Education and Teaching Reform Project in 2024 (2024yjsjg014), the Chinese Academy of Medical Sciences Innovation Fund for Medical Sciences (2021-I2M-1-056), Peking University Health Science Center Medical Education Research Funding Project (2025YB19), and Beijing Municipal Health Commission 2024 Residency Training Quality Improvement Program Residency Training (2024021).</p></sec><sec><title>Data Availability</title><p>The data generated or analyzed during this study can be provided by the corresponding author upon reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>YQ, XX, XL, and JL conceived the study and designed the work. XX, YL, and JL contributed to the acquisition of data. YY and YW contributed to the statistical analysis. YQ, YW, and XX contributed to the interpretation of the data. YQ and XX wrote the original draft of the manuscript. All authors reviewed and approved the final version of the manuscript prior to submission and agreed to be accountable for all aspects of the work.</p></fn><fn fn-type="conflict"><p>YW is an employee of Hangzhou Joyrun Medical Science and Technology Co, Ltd. 
The other authors declare no conflicts of interest.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">CV</term><def><p>coefficient of variation</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">MAS</term><def><p>multiagent system</p></def></def-item><def-item><term id="abb4">SP</term><def><p>standardized patient</p></def></def-item><def-item><term id="abb5">VP</term><def><p>virtual patient</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kramer</surname><given-names>AWM</given-names> </name><name name-style="western"><surname>D&#x00FC;sman</surname><given-names>H</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>LHC</given-names> </name><name name-style="western"><surname>Jansen</surname><given-names>JJM</given-names> </name><name name-style="western"><surname>Grol</surname><given-names>RPTM</given-names> </name><name name-style="western"><surname>van der Vleuten</surname><given-names>CPM</given-names> </name></person-group><article-title>Acquisition of communication skills in postgraduate training for general practice</article-title><source>Med Educ</source><year>2004</year><month>02</month><volume>38</volume><issue>2</issue><fpage>158</fpage><lpage>167</lpage><pub-id pub-id-type="doi">10.1111/j.1365-2923.2004.01747.x</pub-id><pub-id pub-id-type="medline">14871386</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>H</given-names> </name><name name-style="western"><surname>Naik</surname><given-names>AD</given-names> </name><name 
name-style="western"><surname>Rao</surname><given-names>R</given-names> </name><name name-style="western"><surname>Petersen</surname><given-names>LA</given-names> </name></person-group><article-title>Reducing diagnostic errors through effective communication: harnessing the power of information technology</article-title><source>J Gen Intern Med</source><year>2008</year><month>04</month><volume>23</volume><issue>4</issue><fpage>489</fpage><lpage>494</lpage><pub-id pub-id-type="doi">10.1007/s11606-007-0393-z</pub-id><pub-id pub-id-type="medline">18373151</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Boissy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Windover</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Bokar</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Communication skills training for physicians improves patient satisfaction</article-title><source>J Gen Intern Med</source><year>2016</year><month>07</month><volume>31</volume><issue>7</issue><fpage>755</fpage><lpage>761</lpage><pub-id pub-id-type="doi">10.1007/s11606-016-3597-2</pub-id><pub-id pub-id-type="medline">26921153</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Challa</surname><given-names>KT</given-names> </name><name name-style="western"><surname>Sayed</surname><given-names>A</given-names> </name><name name-style="western"><surname>Acharya</surname><given-names>Y</given-names> </name></person-group><article-title>Modern techniques of teaching and learning in medical education: a descriptive literature review</article-title><source>MedEdPublish (2016)</source><year>2021</year><volume>10</volume><fpage>18</fpage><pub-id 
pub-id-type="doi">10.15694/mep.2021.000018.1</pub-id><pub-id pub-id-type="medline">38486533</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ende</surname><given-names>J</given-names> </name></person-group><article-title>Feedback in clinical medical education</article-title><source>JAMA</source><year>1983</year><month>08</month><day>12</day><volume>250</volume><issue>6</issue><fpage>777</fpage><lpage>781</lpage><pub-id pub-id-type="doi">10.1001/jama.1983.03340060055026</pub-id><pub-id pub-id-type="medline">6876333</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Pei</surname><given-names>Y</given-names> </name></person-group><article-title>Safety and efficacy of probiotics in the prevention of necrotizing enterocolitis in premature and/or low-birthweight infants: a systematic review and meta-analysis</article-title><source>Transl Pediatr</source><year>2022</year><month>02</month><volume>11</volume><issue>2</issue><fpage>249</fpage><lpage>259</lpage><pub-id pub-id-type="doi">10.21037/tp-22-27</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>R&#x00F8;nning</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Bj&#x00F8;rkly</surname><given-names>S</given-names> </name></person-group><article-title>The use of clinical role-play and reflection in learning therapeutic communication skills in mental health education: an 
integrative review</article-title><source>Adv Med Educ Pract</source><year>2019</year><volume>10</volume><fpage>415</fpage><lpage>425</lpage><pub-id pub-id-type="doi">10.2147/AMEP.S202115</pub-id><pub-id pub-id-type="medline">31417328</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lovink</surname><given-names>A</given-names> </name><name name-style="western"><surname>Groenier</surname><given-names>M</given-names> </name><name name-style="western"><surname>van der Niet</surname><given-names>A</given-names> </name><name name-style="western"><surname>Miedema</surname><given-names>H</given-names> </name><name name-style="western"><surname>Rethans</surname><given-names>JJ</given-names> </name></person-group><article-title>How simulated patients contribute to student learning in an authentic way, an interview study</article-title><source>Adv Simul</source><year>2024</year><month>01</month><day>11</day><volume>9</volume><issue>1</issue><fpage>4</fpage><pub-id pub-id-type="doi">10.1186/s41077-023-00277-w</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kyaw</surname><given-names>BM</given-names> </name><name name-style="western"><surname>Posadzki</surname><given-names>P</given-names> </name><name name-style="western"><surname>Paddock</surname><given-names>S</given-names> </name><name name-style="western"><surname>Car</surname><given-names>J</given-names> </name><name name-style="western"><surname>Campbell</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tudor Car</surname><given-names>L</given-names> </name></person-group><article-title>Effectiveness of digital education on communication skills among medical students: systematic review and meta-analysis by the digital health education 
collaboration</article-title><source>J Med Internet Res</source><year>2019</year><month>08</month><day>27</day><volume>21</volume><issue>8</issue><fpage>e12967</fpage><pub-id pub-id-type="doi">10.2196/12967</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Papanagnou</surname><given-names>D</given-names> </name><name name-style="western"><surname>Klein</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>XC</given-names> </name><etal/></person-group><article-title>Developing standardized patient-based cases for communication training: lessons learned from training residents to communicate diagnostic uncertainty</article-title><source>Adv Simul</source><year>2021</year><month>12</month><volume>6</volume><issue>1</issue><fpage>26</fpage><pub-id pub-id-type="doi">10.1186/s41077-021-00176-y</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Gr&#x00E9;visse</surname><given-names>C</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Florez</surname><given-names>H</given-names> </name><name name-style="western"><surname>Astudillo</surname><given-names>H</given-names> </name></person-group><article-title>RasPatient pi: a low-cost customizable LLM-based virtual standardized patient simulator</article-title><source>Appl Inform</source><year>2025</year><publisher-name>Springer Nature Switzerland</publisher-name><fpage>125</fpage><lpage>137</lpage><pub-id pub-id-type="doi">10.1007/978-3-031-75147-9_9</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jia</surname><given-names>X</given-names> 
</name><name name-style="western"><surname>Guo</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Study on the effect of simulation-based case teaching method on the preclinical teaching of tooth defects restoration</article-title><source>BMC Med Educ</source><year>2025</year><month>04</month><day>5</day><volume>25</volume><issue>1</issue><fpage>487</fpage><pub-id pub-id-type="doi">10.1186/s12909-025-07010-3</pub-id><pub-id pub-id-type="medline">40188128</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chan</surname><given-names>KY</given-names> </name><name name-style="western"><surname>Yuen</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Co</surname><given-names>M</given-names> </name></person-group><article-title>Using ChatGPT for medical education: the technical perspective</article-title><source>BMC Med Educ</source><year>2025</year><month>02</month><day>7</day><volume>25</volume><issue>1</issue><fpage>201</fpage><pub-id pub-id-type="doi">10.1186/s12909-025-06785-9</pub-id><pub-id pub-id-type="medline">39920711</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Raza</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jahangir</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Riaz</surname><given-names>MB</given-names> </name><name name-style="western"><surname>Saeed</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Sattar</surname><given-names>MA</given-names> </name></person-group><article-title>Industrial applications of large language models</article-title><source>Sci 
Rep</source><year>2025</year><month>04</month><day>21</day><volume>15</volume><issue>1</issue><fpage>13755</fpage><pub-id pub-id-type="doi">10.1038/s41598-025-98483-1</pub-id><pub-id pub-id-type="medline">40258923</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lu</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>B</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>J</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ku</surname><given-names>LW</given-names> </name><name name-style="western"><surname>Martins</surname><given-names>A</given-names> </name><name name-style="western"><surname>Srikumar</surname><given-names>V</given-names> </name></person-group><article-title>Large language models are superpositions of all characters: attaining arbitrary role-play via self-alignment</article-title><conf-name>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name><conf-date>Aug 11-16, 2024</conf-date><conf-loc>Bangkok, Thailand</conf-loc><fpage>7828</fpage><lpage>7840</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.acl-long.423</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lampinen</surname><given-names>AK</given-names> </name><name name-style="western"><surname>Dasgupta</surname><given-names>I</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>SCY</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name 
name-style="western"><surname>Abbott</surname><given-names>D</given-names> </name></person-group><article-title>Language models, like humans, show content effects on reasoning tasks</article-title><source>PNAS Nexus</source><year>2024</year><month>07</month><volume>3</volume><issue>7</issue><fpage>pgae233</fpage><pub-id pub-id-type="doi">10.1093/pnasnexus/pgae233</pub-id><pub-id pub-id-type="medline">39015546</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kononowicz</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Zary</surname><given-names>N</given-names> </name><name name-style="western"><surname>Edelbring</surname><given-names>S</given-names> </name><name name-style="western"><surname>Corral</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hege</surname><given-names>I</given-names> </name></person-group><article-title>Virtual patients--what are we talking about? 
A framework to classify the meanings of the term in healthcare education</article-title><source>BMC Med Educ</source><year>2015</year><month>02</month><day>1</day><volume>15</volume><issue>1</issue><fpage>11</fpage><pub-id pub-id-type="doi">10.1186/s12909-015-0296-3</pub-id><pub-id pub-id-type="medline">25638167</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Schaekermann</surname><given-names>M</given-names> </name><name name-style="western"><surname>Palepu</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Towards conversational diagnostic artificial intelligence</article-title><source>Nature</source><year>2025</year><month>06</month><day>12</day><volume>642</volume><issue>8067</issue><fpage>442</fpage><lpage>450</lpage><pub-id pub-id-type="doi">10.1038/s41586-025-08866-7</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johri</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jeong</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tran</surname><given-names>BA</given-names> </name><etal/></person-group><article-title>An evaluation framework for clinical use of large language models in patient interaction tasks</article-title><source>Nat Med</source><year>2025</year><month>01</month><volume>31</volume><issue>1</issue><fpage>77</fpage><lpage>86</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03328-5</pub-id><pub-id pub-id-type="medline">39747685</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Schmidgall</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ziaei</surname><given-names>R</given-names> </name><name name-style="western"><surname>Harris</surname><given-names>C</given-names> </name><name name-style="western"><surname>Reis</surname><given-names>E</given-names> </name><name name-style="western"><surname>Jopling</surname><given-names>J</given-names> </name><name name-style="western"><surname>Moor</surname><given-names>M</given-names> </name></person-group><article-title>AgentClinic: a multimodal agent benchmark to evaluate AI in simulated clinical environments</article-title><source>arXiv</source><comment>Preprint posted online on  May 25, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2405.07960</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>Team B-M</collab><name name-style="western"><surname>Dou</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Baichuan-M2: scaling medical capability with large verifier system</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 2, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2509.02208</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sardesai</surname><given-names>N</given-names> </name><name name-style="western"><surname>Russo</surname><given-names>P</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sardesai</surname><given-names>A</given-names> </name></person-group><article-title>Utilizing generative conversational artificial intelligence to create simulated patient 
encounters: a pilot study for anaesthesia training</article-title><source>Postgrad Med J</source><year>2024</year><month>03</month><day>18</day><volume>100</volume><issue>1182</issue><fpage>237</fpage><lpage>241</lpage><pub-id pub-id-type="doi">10.1093/postmj/qgad137</pub-id><pub-id pub-id-type="medline">38240054</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Borg</surname><given-names>A</given-names> </name><name name-style="western"><surname>Georg</surname><given-names>C</given-names> </name><name name-style="western"><surname>Jobs</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Virtual patient simulations using social robotics combined with large language models for clinical reasoning training in medical education: mixed methods study</article-title><source>J Med Internet Res</source><year>2025</year><month>03</month><day>3</day><volume>27</volume><fpage>e63312</fpage><pub-id pub-id-type="doi">10.2196/63312</pub-id><pub-id pub-id-type="medline">40053778</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yamamoto</surname><given-names>A</given-names> </name><name name-style="western"><surname>Koda</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ogawa</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Enhancing medical interview skills through AI-simulated patient interactions: nonrandomized controlled trial</article-title><source>JMIR Med Educ</source><year>2024</year><month>09</month><day>23</day><volume>10</volume><fpage>e58753</fpage><pub-id pub-id-type="doi">10.2196/58753</pub-id><pub-id pub-id-type="medline">39312284</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation 
citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Achananuparp</surname><given-names>P</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>EP</given-names> </name></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Ku</surname><given-names>LW</given-names> </name><name name-style="western"><surname>Martins</surname><given-names>A</given-names> </name><name name-style="western"><surname>Srikumar</surname><given-names>V</given-names> </name></person-group><article-title>Speaker verification in agent-generated conversations</article-title><conf-name>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name><conf-date>Aug 11-16, 2024</conf-date><conf-loc>Bangkok, Thailand</conf-loc><fpage>5655</fpage><lpage>5676</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.acl-long.307</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ahn</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>J</given-names> </name><etal/></person-group><article-title>TimeChara: evaluating point-in-time character hallucination of role-playing large language models</article-title><conf-name>Findings of the Association for Computational Linguistics ACL 2024</conf-name><conf-date>Aug 11-16, 2024</conf-date><conf-loc>Bangkok, Thailand and virtual meeting</conf-loc><fpage>3291</fpage><lpage>3325</lpage><pub-id 
pub-id-type="doi">10.18653/v1/2024.findings-acl.197</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>H</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Globerson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mackey</surname><given-names>L</given-names> </name><name name-style="western"><surname>Belgrave</surname><given-names>D</given-names> </name><name name-style="western"><surname>Fan</surname><given-names>A</given-names> </name><name name-style="western"><surname>Paquet</surname><given-names>U</given-names> </name><name name-style="western"><surname>Tomczak</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>C</given-names> </name></person-group><article-title>RoleAgent: building, interacting, and benchmarking high-quality role-playing agents from scripts</article-title><conf-name>Advances in Neural Information Processing Systems 37</conf-name><conf-date>Dec 10-15, 2024</conf-date><conf-loc>Vancouver, BC, Canada</conf-loc><fpage>49403</fpage><lpage>49428</lpage><pub-id pub-id-type="doi">10.52202/079017-1563</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dorri</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kanhere</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Jurdak</surname><given-names>R</given-names> </name></person-group><article-title>Multi-agent systems: a 
survey</article-title><source>IEEE Access</source><year>2018</year><volume>6</volume><fpage>28573</fpage><lpage>28593</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2018.2831228</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Large language model based multi-agents: a survey of progress and challenges</article-title><conf-name>Thirty-Third International Joint Conference on Artificial Intelligence</conf-name><conf-date>Aug 3-9, 2024</conf-date><pub-id pub-id-type="doi">10.24963/ijcai.2024/890</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Steenstra</surname><given-names>I</given-names> </name><name name-style="western"><surname>Nouraei</surname><given-names>F</given-names> </name><name name-style="western"><surname>Bickmore</surname><given-names>TW</given-names> </name></person-group><article-title>Scaffolding empathy: training counselors with simulated patients and utterance-level performance visualizations</article-title><conf-name>CHI &#x2019;25: Proceedings of the 2025 CHI Conference on Human Factors in Computing Systems</conf-name><conf-date>Apr 26, 2025</conf-date><conf-loc>Yokohama Japan</conf-loc><fpage>1</fpage><lpage>22</lpage><pub-id pub-id-type="doi">10.1145/3706598.3714014</pub-id><pub-id pub-id-type="medline">40245409</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yu</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Zhou</surname><given-names>J</given-names> </name><name name-style="western"><surname>Li</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Simulated patient systems powered by large language model-based AI agents offer potential for transforming medical education</article-title><source>Commun Med</source><comment>Preprint posted online on  Sep 27, 2024</comment><pub-id pub-id-type="doi">10.1038/s43856-025-01283-x</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bowers</surname><given-names>P</given-names> </name><name name-style="western"><surname>Graydon</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ryan</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lau</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Tomlin</surname><given-names>D</given-names> </name></person-group><article-title>Artificial intelligence-driven virtual patients for communication skill development in healthcare students</article-title><source>AJET</source><year>2024</year><month>06</month><day>7</day><volume>40</volume><issue>3</issue><fpage>39</fpage><lpage>57</lpage><pub-id pub-id-type="doi">10.14742/ajet.9307</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>G</given-names> </name><name name-style="western"><surname>Dingjie</surname><given-names>S</given-names> </name><etal/></person-group><person-group person-group-type="editor"><name name-style="western"><surname>Duh</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Gomez</surname><given-names>H</given-names> </name><name name-style="western"><surname>Bethard</surname><given-names>S</given-names> </name></person-group><article-title>CMB: a comprehensive medical benchmark in Chinese</article-title><conf-name>Proceedings of the 2024 Conference of the North American Chapter of the Association for Computational Linguistics</conf-name><conf-date>Jun 16-21, 2024</conf-date><conf-loc>Mexico City, Mexico</conf-loc><fpage>6184</fpage><lpage>6205</lpage><pub-id pub-id-type="doi">10.18653/v1/2024.naacl-long.343</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McCrae</surname><given-names>RR</given-names> </name><name name-style="western"><surname>Costa</surname><given-names>PT</given-names> </name></person-group><article-title>Validation of the five-factor model of personality across instruments and observers</article-title><source>J Pers Soc Psychol</source><year>1987</year><month>01</month><volume>52</volume><issue>1</issue><fpage>81</fpage><lpage>90</lpage><pub-id pub-id-type="doi">10.1037//0022-3514.52.1.81</pub-id><pub-id pub-id-type="medline">3820081</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Haring</surname><given-names>CM</given-names> </name><name name-style="western"><surname>Cools</surname><given-names>BM</given-names> </name><name name-style="western"><surname>van Gurp</surname><given-names>PJM</given-names> </name><name name-style="western"><surname>van der Meer</surname><given-names>JWM</given-names> </name><name name-style="western"><surname>Postma</surname><given-names>CT</given-names> </name></person-group><article-title>Observable phenomena that reveal medical students&#x2019; clinical reasoning ability during expert assessment of 
their history taking: a qualitative study</article-title><source>BMC Med Educ</source><year>2017</year><month>08</month><day>29</day><volume>17</volume><issue>1</issue><fpage>147</fpage><pub-id pub-id-type="doi">10.1186/s12909-017-0983-3</pub-id><pub-id pub-id-type="medline">28851340</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maguire</surname><given-names>GP</given-names> </name><name name-style="western"><surname>Rutter</surname><given-names>DR</given-names> </name></person-group><article-title>History-taking for medical students. I-Deficiencies in performance</article-title><source>Lancet</source><year>1976</year><month>09</month><day>11</day><volume>2</volume><issue>7985</issue><fpage>556</fpage><lpage>558</lpage><pub-id pub-id-type="doi">10.1016/s0140-6736(76)91804-3</pub-id><pub-id pub-id-type="medline">60632</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alrasheedi</surname><given-names>AA</given-names> </name></person-group><article-title>Deficits in history taking skills among final year medical students in a family medicine course: a study from KSA</article-title><source>J Taibah Univ Med Sci</source><year>2018</year><month>10</month><volume>13</volume><issue>5</issue><fpage>415</fpage><lpage>421</lpage><pub-id pub-id-type="doi">10.1016/j.jtumed.2018.07.001</pub-id><pub-id pub-id-type="medline">31435357</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Li</surname><given-names>A</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>B</given-names> 
</name><etal/></person-group><article-title>Qwen3 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  May 14, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2505.09388</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="other"><person-group person-group-type="author"><collab>DeepSeek-AI</collab><name name-style="western"><surname>Liu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>B</given-names> </name><name name-style="western"><surname>Xue</surname><given-names>B</given-names> </name><etal/></person-group><article-title>DeepSeek-V3 technical report</article-title><source>arXiv</source><comment>Preprint posted online on  Dec 27, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.19437</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="web"><article-title>GPT-4o system card</article-title><source>OpenAI</source><year>2024</year><access-date>2024-08-07</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://cdn.openai.com/gpt-4o-system-card.pdf">https://cdn.openai.com/gpt-4o-system-card.pdf</ext-link></comment></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Tu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Fan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tian</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>CharacterEval: a Chinese benchmark for role-playing conversational agent evaluation</article-title><year>2024</year><month>01</month><day>9</day><conf-name>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name><pub-id 
pub-id-type="doi">10.18653/v1/2024.acl-long.638</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Xiao</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>P</given-names> </name><name name-style="western"><surname>Muennighoff</surname><given-names>N</given-names> </name><name name-style="western"><surname>Lian</surname><given-names>D</given-names> </name><name name-style="western"><surname>Nie</surname><given-names>JY</given-names> </name></person-group><article-title>C-pack: packed resources for general Chinese embeddings</article-title><source>arXiv</source><comment>Preprint posted online on  Sep 14, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2309.07597</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Tu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Fan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tian</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>CharacterEval: a Chinese benchmark for role-playing conversational agent evaluation</article-title><year>2024</year><month>01</month><day>2</day><conf-name>Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)</conf-name><pub-id pub-id-type="doi">10.18653/v1/2024.acl-long.638</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Tang</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Chen</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Thinking in character: advancing role-playing agents with role-aware reasoning</article-title><source>arXiv</source><comment>Preprint posted online on  Jun 2, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2506.01748</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zeng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhong</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>L</given-names> </name></person-group><article-title>Leveraging large language model as simulated patients for clinical education</article-title><source>arXiv</source><comment>Preprint posted online on  Apr 13, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2404.13066</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Balachandran</surname><given-names>V</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ilgen</surname><given-names>J</given-names> </name><etal/></person-group><article-title>MediQ: question-asking llms and a benchmark for reliable interactive clinical reasoning</article-title><year>2024</year><month>06</month><day>3</day><conf-name>Advances in Neural Information Processing Systems 
37</conf-name><pub-id pub-id-type="doi">10.52202/079017-0908</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jowsey</surname><given-names>T</given-names> </name><name name-style="western"><surname>Stokes-Parish</surname><given-names>J</given-names> </name><name name-style="western"><surname>Singleton</surname><given-names>R</given-names> </name><name name-style="western"><surname>Todorovic</surname><given-names>M</given-names> </name></person-group><article-title>Medical education empowered by generative artificial intelligence large language models</article-title><source>Trends Mol Med</source><year>2023</year><month>12</month><volume>29</volume><issue>12</issue><fpage>971</fpage><lpage>973</lpage><pub-id pub-id-type="doi">10.1016/j.molmed.2023.08.012</pub-id><pub-id pub-id-type="medline">37718142</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tran</surname><given-names>M</given-names> </name><name name-style="western"><surname>Balasooriya</surname><given-names>C</given-names> </name><name name-style="western"><surname>Jonnagaddala</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Situating governance and regulatory concerns for generative artificial intelligence and large language models in medical education</article-title><source>npj Digit Med</source><year>2025</year><month>05</month><day>27</day><volume>8</volume><issue>1</issue><fpage>315</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01721-z</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lebai 
Lutfi</surname><given-names>S</given-names> </name></person-group><article-title>Large language model-based virtual patient systems for history-taking in medical education: comprehensive systematic review</article-title><source>JMIR Med Inform</source><year>2026</year><month>01</month><day>2</day><volume>14</volume><fpage>e79039</fpage><pub-id pub-id-type="doi">10.2196/79039</pub-id><pub-id pub-id-type="medline">41481915</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>ML</given-names> </name><name name-style="western"><surname>Chao</surname><given-names>LF</given-names> </name><name name-style="western"><surname>Hung</surname><given-names>YT</given-names> </name></person-group><article-title>Training students to serve as standardized patients in an objective structured clinical examination is feasible: a mixed-methods study</article-title><source>Nurse Educ Pract</source><year>2024</year><month>08</month><volume>79</volume><fpage>104069</fpage><pub-id pub-id-type="doi">10.1016/j.nepr.2024.104069</pub-id><pub-id pub-id-type="medline">39053150</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Chuang</surname><given-names>YN</given-names> </name><etal/></person-group><article-title>Context matching is not reasoning when performing generalized clinical evaluation of generative language models</article-title><source>NPJ Digit Med</source><year>2025</year><month>12</month><day>27</day><volume>9</volume><issue>1</issue><fpage>71</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-02253-2</pub-id><pub-id 
pub-id-type="medline">41455812</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Prompts.</p><media xlink:href="jmir_v28i1e84747_app1.docx" xlink:title="DOCX File, 29 KB"/></supplementary-material></app-group></back></article>