<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e73226</article-id><article-id pub-id-type="doi">10.2196/73226</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Application of Large Language Models in Stroke Rehabilitation Health Education: 2-Phase Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Qiang</surname><given-names>Shiqi</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Zhang</surname><given-names>Haitao</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Liao</surname><given-names>Yang</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" 
rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Yue</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gu</surname><given-names>Yanfen</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Yiyan</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Xu</surname><given-names>Zehui</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Shi</surname><given-names>Hui</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff7">7</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Han</surname><given-names>Nuo</given-names></name><degrees>BSc</degrees><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Yu</surname><given-names>Haiping</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff9">9</xref></contrib></contrib-group><aff id="aff1"><institution>Shanghai East Hospital, School of Medicine, Tongji University</institution><addr-line>No. 
1800 Yuntai Road</addr-line><addr-line>Shanghai</addr-line><country>China</country></aff><aff id="aff2"><institution>Department of Emergency and Critical Care, Shanghai East Hospital, School of Medicine, Tongji University</institution><addr-line>Shanghai</addr-line><country>China</country></aff><aff id="aff3"><institution>Neurological Rehabilitation Center, Shanghai Sunshine Rehabilitation Center, School of Medicine, Tongji University</institution><addr-line>Shanghai</addr-line><country>China</country></aff><aff id="aff4"><institution>Department of Neurology, Shanghai East Hospital, School of Medicine, Tongji University</institution><addr-line>Shanghai</addr-line><country>China</country></aff><aff id="aff5"><institution>Department of Gastrointestinal Endoscopy, Shanghai East Hospital, School of Medicine, Tongji University</institution><addr-line>Shanghai</addr-line><country>China</country></aff><aff id="aff6"><institution>Department of Breast Diseases, Yueyang Hospital of Integrated Traditional Chinese and Western Medicine, Shanghai University of Traditional Chinese Medicine</institution><addr-line>Shanghai</addr-line><country>China</country></aff><aff id="aff7"><institution>Department of Nursing, Zhongshan Hospital, Fudan University</institution><addr-line>Shanghai</addr-line><country>China</country></aff><aff id="aff8"><institution>School of Acupuncture-Moxibustion and Tuina, Shanghai University of Traditional Chinese Medicine</institution><addr-line>Shanghai</addr-line><country>China</country></aff><aff id="aff9"><institution>Department of Nursing, Shanghai East Hospital, School of Medicine, Tongji University</institution><addr-line>No. 
1800 Yuntai Road</addr-line><addr-line>Shanghai</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Costa</surname><given-names>Davide</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Gupta</surname><given-names>Gaurav Kumar</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Haiping Yu, PhD, Department of Nursing, Shanghai East Hospital, School of Medicine, Tongji University, No. 1800 Yuntai Road, Shanghai, 200120, China, 86 18964538997; <email>pingping670@sina.com</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>22</day><month>7</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e73226</elocation-id><history><date date-type="received"><day>28</day><month>02</month><year>2025</year></date><date date-type="rev-recd"><day>24</day><month>04</month><year>2025</year></date><date date-type="accepted"><day>13</day><month>05</month><year>2025</year></date></history><copyright-statement>&#x00A9; Shiqi Qiang, Haitao Zhang, Yang Liao, Yue Zhang, Yanfen Gu, Yiyan Wang, Zehui Xu, Hui Shi, Nuo Han, Haiping Yu. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 22.7.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e73226"/><related-article related-article-type="correction-forward" id="v27e84717" ext-link-type="doi" xlink:href="84717" xlink:title="This is a corrected version. See correction statement in" vol="27" page="e84717" xlink:type="simple">https://www.jmir.org/2025/1/e84717</related-article><abstract><sec><title>Background</title><p>Stroke is a leading cause of disability and death worldwide, with home-based rehabilitation playing a crucial role in improving patient prognosis and quality of life. Traditional health education often lacks precision, personalization, and accessibility. In contrast, large language models (LLMs) are gaining attention for their potential in medical health education, owing to their advanced natural language processing capabilities. 
However, the effectiveness of LLMs in home-based stroke rehabilitation remains uncertain.</p></sec><sec><title>Objective</title><p>This study evaluates the effectiveness of 4 LLMs&#x2014;ChatGPT-4, MedGo, Qwen, and ERNIE Bot&#x2014;selected for their diversity in model type, clinical relevance, and accessibility at the time of study design in home-based stroke rehabilitation. The aim is to offer patients with stroke more precise and secure health education pathways while exploring the feasibility of using LLMs to guide health education.</p></sec><sec sec-type="methods"><title>Methods</title><p>In the first phase of this study, a literature review and expert interviews identified 15 common questions and 2 clinical cases relevant to patients with stroke in home-based rehabilitation. These were input into 4 LLMs for simulated consultations. Six medical experts (2 clinicians, 2 nursing specialists, and 2 rehabilitation therapists) evaluated the LLM-generated responses using a Likert 5-point scale, assessing accuracy, completeness, readability, safety, and humanity. In the second phase, the top 2 performing models from phase 1 were selected. Thirty patients with stroke undergoing home-based rehabilitation were recruited. Each patient asked both models 3 questions, rated the responses using a satisfaction scale, and assessed readability, text length, and recommended reading age using a Chinese readability analysis tool. Data were analyzed using one-way ANOVA, post hoc Tukey Honestly Significant Difference tests, and paired <italic>t</italic> tests.</p></sec><sec sec-type="results"><title>Results</title><p>The results revealed significant differences across the 4 models in 5 dimensions: accuracy (<italic>P</italic>=.002), completeness (<italic>P</italic>&#x003C;.001), readability (<italic>P</italic>=.04), safety (<italic>P</italic>=.007), and humanity (<italic>P</italic>&#x003C;.001). 
ChatGPT-4 outperformed all models in each dimension, with scores for accuracy (mean 4.28, SD 0.84), completeness (mean 4.35, SD 0.75), readability (mean 4.28, SD 0.85), safety (mean 4.38, SD 0.81), and humanity (mean 4.65, SD 0.66). MedGo excelled in accuracy (mean 4.06, SD 0.78) and completeness (mean 4.06, SD 0.74). Qwen and ERNIE Bot scored significantly lower across all 5 dimensions than ChatGPT-4 and MedGo. ChatGPT-4 generated the longest responses (mean 1338.35, SD 236.03) and had the highest readability score (mean 12.88). In the second phase, ChatGPT-4 performed the best overall, while MedGo provided the clearest responses.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>LLMs, particularly ChatGPT-4 and MedGo, demonstrated promising performance in home-based stroke rehabilitation education. However, discrepancies between expert and patient evaluations highlight the need for improved alignment with patient comprehension and expectations. Enhancing clinical accuracy, readability, and oversight mechanisms will be essential for future real-world integration.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>stroke</kwd><kwd>artificial intelligence</kwd><kwd>home rehabilitation</kwd><kwd>health education</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Stroke is a leading cause of mortality and disability among middle-aged and older adult individuals worldwide. According to the World Health Organization, more than 15 million people experience a stroke annually, resulting in approximately 6 million deaths and leaving millions more with varying degrees of long-term disability [<xref ref-type="bibr" rid="ref1">1</xref>]. 
With the global aging population, stroke prevalence continues to rise, particularly in low-income and middle-income countries, where the disease burden is increasing, posing a significant public health challenge [<xref ref-type="bibr" rid="ref2">2</xref>]. Stroke rehabilitation is critical for improving patient prognosis and quality of life [<xref ref-type="bibr" rid="ref1">1</xref>]. Home-based rehabilitation, offering convenience and cost-effectiveness, has gained significant attention in recent years. However, successful home rehabilitation requires active patient engagement, effective family support, and professional guidance [<xref ref-type="bibr" rid="ref3">3</xref>]. Comprehensive health education and rehabilitation instructions are essential to ensure that patients adhere to appropriate recovery protocols at home. Traditional health education methods, such as printed materials, books, and verbal instructions, often face challenges such as delayed information dissemination, inconsistent interpretation, and a lack of personalized support, which can hinder rehabilitation outcomes.</p><p>In recent years, large language models (LLMs) have rapidly advanced, gaining significant attention in the medical and health care fields due to their advancements in natural language processing. LLMs possess powerful language comprehension and generation capabilities, enabling them to deliver personalized, easily understandable health guidance tailored to patients&#x2019; needs [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>]. Existing studies have explored LLM applications in patient education, mental health interventions, and health management [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. 
However, LLMs vary considerably in terms of medical accuracy, comprehensiveness, safety, and readability.</p><p>To address these challenges, Shanghai East Hospital developed MedGo in 2024, a specialized Chinese medical LLM designed to assist health care professionals in clinical decision-making and medical consultations. MedGo has demonstrated exceptional capabilities in medical task processing, ranking among the top models in the Chinese Biomedical Language Understanding Evaluation benchmark and excelling in medical question-answering assessments [<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>To evaluate the practical effectiveness of LLMs in this context, we selected 4 representative models for comparison: ChatGPT-4, MedGo, Qwen, and ERNIE Bot. These models were chosen based on their diversity in orientation (general-purpose vs medical-specific), availability to Chinese users, and technological maturity at the time of study design. ChatGPT-4 and Qwen are general-purpose LLMs widely recognized for their advanced dialogue and multilingual capabilities. ERNIE Bot, developed by Baidu, is a leading domestic model with robust Chinese language processing. MedGo, as a medical-specific model developed within the Chinese clinical context, brings unique relevance to health care applications. This selection offers a balanced foundation for comparing the performance of LLMs in stroke rehabilitation health education.</p><p>The application of LLMs in the medical field is still in its early stages [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>], with limited exploration of how to effectively integrate them into home rehabilitation for patients with stroke. Most existing studies [<xref ref-type="bibr" rid="ref11">11</xref>] primarily focus on general-purpose LLMs (eg, ChatGPT and Google Bard), overlooking the potential of specialized medical models in health care [<xref ref-type="bibr" rid="ref12">12</xref>]. 
Furthermore, comprehensive comparisons of various LLMs in medical applications are scarce.</p><p>This study aims to evaluate the effectiveness of multiple LLMs, including specialized medical models, in supporting home rehabilitation for patients with stroke. By offering a more precise and safer rehabilitation education pathway, this research seeks to advance the application of LLMs in health care, providing a more efficient and scientifically sound solution for home rehabilitation for patients with stroke. The findings have significant academic and practical implications for promoting the adoption of LLMs in health care.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>This study was conducted from January 5 to February 22, 2025. Using both quantitative and qualitative analyses, the study aimed to evaluate the effectiveness of LLMs in home-based stroke rehabilitation education.</p></sec><sec id="s2-2"><title>Study Subjects</title><p>This study compared 4 representative LLMs&#x2014;ChatGPT-4, MedGo, Qwen-Max, and ERNIE Bot V3.5&#x2014;based on their accessibility, medical relevance, and model diversity (<xref ref-type="table" rid="table1">Table 1</xref>).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Study models.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Model</td><td align="left" valign="top">Version</td><td align="left" valign="top">Source</td><td align="left" valign="top">Medical-specific</td><td align="left" valign="top">Open source</td></tr></thead><tbody><tr><td align="left" valign="top">ChatGPT</td><td align="left" valign="top">V4.0</td><td align="left" valign="top">OpenAI (USA)</td><td align="left" valign="top">No</td><td align="left" valign="top">No</td></tr><tr><td align="left" valign="top">MedGo</td><td align="char" char="." 
valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></td><td align="left" valign="top">Laboratory of Biomedical Artificial Intelligence (China)</td><td align="left" valign="top">Yes</td><td align="left" valign="top">No</td></tr><tr><td align="left" valign="top">Qwen</td><td align="left" valign="top">Qwen-Max</td><td align="left" valign="top">Alibaba (China)</td><td align="left" valign="top">No</td><td align="left" valign="top">No</td></tr><tr><td align="left" valign="top">ERNIE Bot</td><td align="left" valign="top">V3.5</td><td align="left" valign="top">Baidu (China)</td><td align="left" valign="top">No</td><td align="left" valign="top">No</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>Not available.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>Phase 1</title><sec id="s2-3-1"><title>Questionnaire Design</title><p>The study aimed to evaluate the effectiveness of LLMs in home-based stroke rehabilitation education. By comparing the performance of responses from 4 different LLMs and incorporating expert ratings and patient feedback [<xref ref-type="bibr" rid="ref13">13</xref>], we explored their practical application in the rehabilitation process. 
The detailed procedure is outlined in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Research design workflow diagram.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e73226_fig01.png"/></fig><p>Common inquiries about home rehabilitation for patients with stroke from the past 3 years were collected and summarized from web-based medical platforms, including &#x201C;DingXiangYuan,&#x201D; &#x201C;HaoDF,&#x201D; &#x201C;Baidu Health Ask a Doctor,&#x201D; and &#x201C;AliHealth.&#x201D; Additionally, international stroke rehabilitation guidelines [<xref ref-type="bibr" rid="ref14">14</xref>]&#x2014;such as those from the American Heart Association, the American Stroke Association, and the Chinese Stroke Association&#x2014;were reviewed to identify key concerns during the postoperative rehabilitation phase. Based on this information, an initial set of questions was selected.</p><p>Using this feedback and incorporating the clinical experience of neurorehabilitation nursing experts, a set of 15 targeted questions and 2 common home rehabilitation case scenarios were compiled. Case 1 focused on improving limb mobility and self-care abilities in daily life, with an emphasis on blood pressure control. Case 2 addressed not only limb and gait training but also speech and swallowing rehabilitation, diabetes management, and shoulder pain relief. The questionnaire is shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-3-2"><title>Model Testing</title><p>To enhance the professionalism and specificity of LLM responses, ensuring that they address questions from a targeted perspective and align closely with real-world applications, a standardized prompt was added before each model-generated response. 
The prompt instructed the following:</p><disp-quote><p>Assume you are an experienced rehabilitation specialist responsible for assessing the home rehabilitation needs of stroke patients and providing personalized advice based on their specific conditions. Use your professional knowledge to provide detailed answers to the following questions. Note that the inquirer is a patient or caregiver with no medical background. Ensure your responses include clear explanations and reference relevant medical evidence to aid understanding.</p></disp-quote><p>Fifteen home rehabilitation questions and 2 clinical cases related to stroke were input into 4 different LLMs. Each model received the inputs 3 times, with each iteration conducted in a new conversation to eliminate prior chat history, allowing for an assessment of response consistency. The responses from all models were recorded in plain text format [<xref ref-type="bibr" rid="ref15">15</xref>]. A single-blind method was applied: the 3 responses from each model were randomized and grouped into 4 sets of questions, which were then compiled into a questionnaire for expert evaluation.</p></sec><sec id="s2-3-3"><title>Expert Evaluation</title><p>A Likert 5-point rating scale was used to assess the outputs of the 4 LLMs across five dimensions: accuracy, completeness, readability, safety, and humanity. (1) <italic>Accuracy</italic>: It evaluates whether the model-generated information aligns with scientific knowledge and medical facts. (2) <italic>Completeness</italic>: It assesses whether the model&#x2019;s responses fully cover all relevant aspects of the question or case scenario. (3) <italic>Readability</italic>: It determines whether the model&#x2019;s language is clear, concise, and easy to understand. (4) <italic>Safety</italic>: It examines whether the model provides safe and appropriate recommendations without potential risks. 
(5) <italic>Humanity</italic>: It assesses whether the model&#x2019;s responses consider patients&#x2019; emotional needs, dignity, and individual differences while offering care and support.</p><p>Experts could provide brief explanations in the comment section if they had concerns about any response. The detailed definitions of the Likert 5-point rating scale are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>A total of 6 stroke rehabilitation experts (<xref ref-type="table" rid="table2">Table 2</xref>) were selected from tertiary general hospitals and specialized rehabilitation centers in Shanghai. The expert panel included 2 stroke center medical specialists, 2 neurorehabilitation nursing specialists, and 2 rehabilitation therapists, grouped into 3 teams [<xref ref-type="bibr" rid="ref16">16</xref>]. This balanced composition ensured coverage of the key professional perspectives involved in home-based poststroke care while maintaining evaluation feasibility.</p><p>To minimize potential evaluator bias, all experts were blinded to the identities and affiliations of the 4 LLMs. The model outputs were anonymized and randomly ordered, with 3 independent rounds of scoring using distinct response samples in each round. Following a blind evaluation protocol, the experts assessed the outputs of the 4 LLMs across 5 dimensions. Prior to formal assessment, all experts attended a standardized training session and received a structured evaluation manual, which included detailed explanations of each scoring dimension, illustrative examples, and response interpretation guidelines. These measures aimed to promote consistent understanding of the evaluation framework and reduce interrater variability. Experts completed their assessments independently and were instructed not to communicate with one another during the scoring process. 
Additionally, the 6 experts conducted an interpretability analysis of the responses generated by the 4 LLMs. The expert scoring data can be found in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Expert profiles.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Field of expertise</td><td align="left" valign="bottom">Expert</td><td align="left" valign="bottom">Degree</td><td align="left" valign="bottom">Title</td></tr></thead><tbody><tr><td align="left" valign="top">Clinical medicine</td><td align="left" valign="top">Expert 1</td><td align="left" valign="top">PhD</td><td align="left" valign="top">Chief physician</td></tr><tr><td align="left" valign="top">Clinical medicine</td><td align="left" valign="top">Expert 2</td><td align="left" valign="top">PhD</td><td align="left" valign="top">Chief physician</td></tr><tr><td align="left" valign="top">Nursing</td><td align="left" valign="top">Expert 3</td><td align="left" valign="top">PhD</td><td align="left" valign="top">Chief nurse</td></tr><tr><td align="left" valign="top">Nursing</td><td align="left" valign="top">Expert 4</td><td align="left" valign="top">Bachelor&#x2019;s</td><td align="left" valign="top">Associate chief nurse</td></tr><tr><td align="left" valign="top">Rehabilitation medicine</td><td align="left" valign="top">Expert 5</td><td align="left" valign="top">Master&#x2019;s</td><td align="left" valign="top">Senior therapist</td></tr><tr><td align="left" valign="top">Rehabilitation medicine</td><td align="left" valign="top">Expert 6</td><td align="left" valign="top">Master&#x2019;s</td><td align="left" valign="top">Senior therapist</td></tr></tbody></table></table-wrap></sec></sec><sec id="s2-4"><title>Phase 2</title><p>Thirty patients with stroke were recruited in Shanghai. 
The 2 top-performing models from phase 1 were selected for interaction with the patients in a real clinical setting. Each patient asked both models 3 questions related to home-based stroke rehabilitation. The researchers recorded the responses and independently rated them using a satisfaction scale. Patient questions and scoring data can be found in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. See <xref ref-type="other" rid="box1">Textbox 1</xref> for the inclusion and exclusion criteria.</p><boxed-text id="box1"><title> Inclusion and exclusion criteria.</title><p><bold>Inclusion criteria:</bold></p><list list-type="bullet"><list-item><p>Patients diagnosed with stroke (including ischemic or hemorrhagic stroke) who meet the diagnostic criteria in the <italic>2021 China Stroke Prevention and Treatment Guidelines</italic> and have been confirmed by computed tomography or magnetic resonance imaging.</p></list-item><list-item><p>Patients in the rehabilitation phase (1&#x2010;12 months post discharge, with ongoing rehabilitation needs).</p></list-item><list-item><p>Patients undergoing home-based rehabilitation (not long-term hospitalized).</p></list-item><list-item><p>Patients with basic communication skills, able to express their needs accurately (or with a family member to assist in communication).</p></list-item></list><p><bold>Exclusion criteria:</bold></p><list list-type="bullet"><list-item><p>Patients with severe cognitive impairments or those unable to accurately express their needs.</p></list-item><list-item><p>Patients unable to undergo home-based rehabilitation (eg, those requiring long-term hospitalization due to the severity of their condition).</p></list-item><list-item><p>Patients with other serious comorbidities (eg, end-stage cancer and severe heart failure) that would interfere with the rehabilitation plan.</p></list-item></list></boxed-text><p>The assessment criterion was patient satisfaction, based on the following 
specific criteria (<xref ref-type="other" rid="box2">Textbox 2</xref>):</p><boxed-text id="box2"><title> Patient satisfaction rating scale.</title><p>Please select the option that best reflects your overall feeling toward the model&#x2019;s response.</p><p>1 Point. Very dissatisfied: The response from the model is very unsatisfactory, completely failing to meet my needs.</p><p>2 Points. Dissatisfied: The response from the model is unsatisfactory, with many issues, and does not meet my needs.</p><p>3 Points. Neutral: The response from the model is acceptable; it answers some questions but still has room for improvement.</p><p>4 Points. Satisfied: The response from the model is satisfactory, and most of the questions have been answered effectively.</p><p>5 Points. Very satisfied: The response from the model is excellent, and all questions have been answered very well.</p></boxed-text></sec><sec id="s2-5"><title>Statistical Analysis</title><sec id="s2-5-1"><title>Primary Outcomes</title><p>Statistical analyses were performed using SPSS Statistics 27 (IBM Corp) and R (version 4.5.0; R Foundation for Statistical Computing). Descriptive statistics were used to summarize expert and patient ratings as means, SDs, and medians. In phase 1, differences among the 4 LLMs were assessed using one-way ANOVA with Tukey Honestly Significant Difference for post hoc comparisons if normality assumptions (tested via the Shapiro-Wilk test) were met. For nonnormally distributed or heteroscedastic data, the Kruskal-Wallis H test and Mann-Whitney <italic>U</italic> test with Bonferroni correction were applied. In phase 2, paired-sample <italic>t</italic> tests were used to compare patient ratings between 2 selected LLMs. To evaluate the consistency of expert ratings, interrater reliability was assessed using Krippendorff &#x03B1; for overall agreement and Cohen &#x03BA; for pairwise agreement within expert subgroups (nursing, clinical, and rehabilitation). 
These analyses provided insights into potential variations in scoring standards across professional backgrounds.</p></sec><sec id="s2-5-2"><title>Secondary Outcomes</title><p>An objective readability analysis was conducted on the responses generated by the 4 LLMs using a Chinese Readability Assessment Platform. This web-based tool [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>] evaluates text readability by analyzing 52 linguistic features through a multiple linear regression model. The platform provides metrics, including education level, reading difficulty, and recommended reading age, with higher scores indicating more complex text.</p><p>A one-way ANOVA was used to assess differences among the 4 LLMs in terms of word count, reading difficulty scores, and recommended reading age. Post hoc analysis was performed using Tukey Honestly Significant Difference test to examine intermodel differences. Additionally, dot plots were generated using the HIPLOT web-based tool [<xref ref-type="bibr" rid="ref19">19</xref>] to visually present the readability scores for each model. A significance level of &#x03B1;&#x003C;.05 was adopted for all tests.</p></sec></sec><sec id="s2-6"><title>Ethical Considerations</title><p>This study was approved by the medical ethics committee of Shanghai East Hospital (approval no.: 2025YS-042). All participants provided written informed consent prior to participation. Data collected from participants were anonymized to ensure privacy and confidentiality. Participants received a small gift (approximately RMB 30 [US $4.18]) as a token of appreciation for their time and involvement, in accordance with institutional ethics guidelines. 
No identifiable personal information is included in the manuscript or supplementary materials (<xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>).</p><p><italic>Phase 1</italic>: Common questions from patients with stroke undergoing home-based rehabilitation were collected, and based on the International Stroke Rehabilitation Guidelines and expert input, a questionnaire was developed containing 15 questions and 2 typical cases. These questions and cases were input into 4 LLMs&#x2014;ChatGPT, MedGo, Qwen, and ERNIE Bot&#x2014;with each model receiving the inputs 3 times. The models&#x2019; raw text responses were recorded. Two clinical medicine experts, 2 nursing specialists, and 2 rehabilitation therapists evaluated the responses using a 5-point Likert scale across 5 dimensions: accuracy, completeness, readability, safety, and humanity. This evaluation was conducted in 3 rounds, followed by statistical analysis of the ratings. <italic>Phase 2</italic>: Based on phase 1 results, the top 2 performing LLMs were selected for interaction with 30 patients, who provided satisfaction ratings [<xref ref-type="bibr" rid="ref20">20</xref>].</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Phase 1: Primary Outcomes</title><p>Six experts evaluated the responses generated by the LLMs across 5 dimensions: accuracy, completeness, readability, safety, and humanity. <xref ref-type="table" rid="table3">Table 3</xref> shows the scores for each LLM.</p><p>Among the models, ChatGPT-4 achieved the highest scores across all dimensions, with particularly outstanding performance in safety (mean 4.38, SD 0.81) and humanity (mean 4.65, SD 0.65). MedGo performed well in accuracy (mean 4.06, SD 0.78) and completeness (mean 4.06, SD 0.74) but was slightly inferior to ChatGPT-4 in the humanity dimension. Qwen and ERNIE Bot received significantly lower scores than both ChatGPT-4 and MedGo. 
<xref ref-type="table" rid="table3">Table 3</xref> visually shows the average score and highlights the overall performance trends among the models.</p><p>Descriptive statistics (<xref ref-type="table" rid="table4">Table 4</xref>) revealed significant differences across all 5 evaluation dimensions: (1) Accuracy: ChatGPT-4 achieved the highest score (mean 4.28, SD 0.84), followed by MedGo (mean 4.06, SD 0.78), while Qwen and ERNIE Bot both scored lower (mean 3.91, SD 0.77; mean 3.91, SD 0.81, respectively). (2) Completeness: ChatGPT-4 again led (mean 4.35, SD 0.75), followed by MedGo (mean 4.06, SD 0.74), with Qwen and ERNIE Bot receiving lower scores. (3) Readability: ChatGPT-4 scored the highest (mean 4.28, SD 0.85), followed by MedGo (mean 4.17, SD 0.81) and Qwen (mean 4.02, SD 0.81), while ERNIE Bot had the lowest score (mean 3.99, SD 0.79). (4) Safety: ChatGPT-4 topped this dimension (mean 4.38, SD 0.81), followed by MedGo (mean 4.23, SD 0.73), Qwen (mean 4.08, SD 0.78), and ERNIE Bot (mean 4.05, SD 0.75). (5) Humanity: ChatGPT-4 achieved the highest score (mean 4.65, SD 0.65), followed by MedGo (mean 4.38, SD 0.73), with Qwen and ERNIE Bot both scoring identically (mean 4.27, SD 0.72; mean 4.27, SD 0.75, respectively).</p><p>Among the 20 scoring files, Krippendorff &#x03B1; values ranged from 0.26 to 0.75. A total of 14 files (14/20, 70%) demonstrated at least fair interrater agreement (&#x03B1;&#x2265;0.50). The remaining files (6/20, 30%) fell below the 0.50 threshold, indicating lower reliability. Dimensions such as safety and humanity generally showed higher consistency, particularly when evaluated for ChatGPT-4 and MedGo. In contrast, completeness ratings and responses from Qwen and ERNIE Bot yielded more variability across expert scores. 
Cohen &#x03BA; coefficients showed substantial variation across expert subgroups: clinical physicians had fair agreement (&#x03BA;=0.28), while nursing experts (&#x03BA;=0.03) and rehabilitation therapists (&#x03BA;=&#x2212;0.13) showed poor or even negative agreement (<xref ref-type="table" rid="table5">Table 5</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Average scores of responses from 4 large language models and statistical significance of intermodel differences (<italic>P</italic> values).</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dimension</td><td align="left" valign="bottom">ChatGPT-4</td><td align="left" valign="bottom">MedGo</td><td align="left" valign="bottom">Qwen</td><td align="left" valign="bottom">ERNIE Bot</td><td align="left" valign="bottom"><italic>P</italic> value (vs MedGo)</td><td align="left" valign="bottom"><italic>P</italic> value (vs Qwen)</td><td align="left" valign="bottom"><italic>P</italic> value (vs ERNIE Bot)</td></tr></thead><tbody><tr><td align="left" valign="top">Accuracy</td><td align="char" char="." valign="top">4.28</td><td align="char" char="." valign="top">4.06</td><td align="char" char="." valign="top">3.91</td><td align="char" char="." valign="top">3.91</td><td align="char" char="." valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td><td align="char" char="." valign="top">.005</td><td align="char" char="." valign="top">.005</td></tr><tr><td align="left" valign="top">Completeness</td><td align="char" char="." valign="top">4.35</td><td align="char" char="." valign="top">4.06</td><td align="char" char="." valign="top">3.90</td><td align="char" char="." valign="top">3.96</td><td align="char" char="." valign="top">.03</td><td align="char" char="." valign="top">&#x003C;.001</td><td align="char" char="." 
valign="top">.002</td></tr><tr><td align="left" valign="top">Readability</td><td align="char" char="." valign="top">4.28</td><td align="char" char="." valign="top">4.17</td><td align="char" char="." valign="top">4.02</td><td align="char" char="." valign="top">3.99</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." valign="top">.05</td></tr><tr><td align="left" valign="top">Safety</td><td align="char" char="." valign="top">4.38</td><td align="char" char="." valign="top">4.23</td><td align="char" char="." valign="top">4.08</td><td align="char" char="." valign="top">4.05</td><td align="left" valign="top">&#x2014;</td><td align="char" char="." valign="top">.03</td><td align="char" char="." valign="top">.01</td></tr><tr><td align="left" valign="top">Humanity</td><td align="char" char="." valign="top">4.65</td><td align="char" char="." valign="top">4.38</td><td align="char" char="." valign="top">4.27</td><td align="char" char="." valign="top">4.27</td><td align="char" char="." valign="top">.04</td><td align="char" char="." valign="top">.001</td><td align="char" char="." 
valign="top">.001</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Not available.</p></fn></table-wrap-foot></table-wrap><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Descriptive statistics of expert rating analysis and objective readability analysis (ChatGPT-4 and MedGo).</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top" rowspan="2">Dimension</td><td align="left" valign="top" colspan="2">ChatGPT-4</td><td align="left" valign="top" colspan="2">MedGo</td></tr><tr><td align="left" valign="top">Median (IQR)</td><td align="left" valign="top">Mean (SD)</td><td align="left" valign="top">Median (IQR)</td><td align="left" valign="top">Mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Accuracy</td><td align="left" valign="top">5.0 (2.0&#x2010;5.0)</td><td align="left" valign="top">4.28 (0.84)</td><td align="left" valign="top">4.0 (2.0&#x2010;5.0)</td><td align="left" valign="top">4.06 (0.78)</td></tr><tr><td align="left" valign="top">Completeness</td><td align="left" valign="top">5.0 (3.0&#x2010;5.0)</td><td align="left" valign="top">4.35 (0.75)</td><td align="left" valign="top">4.0 (2.0&#x2010;5.0)</td><td align="left" valign="top">4.06 (0.74)</td></tr><tr><td align="left" valign="top">Readability</td><td align="left" valign="top">5.0 (3.0&#x2010;5.0)</td><td align="left" valign="top">4.28 (0.85)</td><td align="left" valign="top">4.0 (2.0&#x2010;5.0)</td><td align="left" valign="top">4.17 (0.81)</td></tr><tr><td align="left" valign="top">Safety</td><td align="left" valign="top">5.0 (2.0&#x2010;5.0)</td><td align="left" valign="top">4.38 (0.81)</td><td align="left" valign="top">4.0 (3.0&#x2010;5.0)</td><td align="left" valign="top">4.23 (0.73)</td></tr><tr><td align="left" valign="top">Humanity</td><td align="left" valign="top">5.0 (3.0&#x2010;5.0)</td><td align="left" valign="top">4.65 (0.65)</td><td align="left" valign="top">5.0 (3.0&#x2010;5.0)</td><td
align="left" valign="top">4.27 (0.73)</td></tr><tr><td align="left" valign="top">Chinese character count</td><td align="left" valign="top">1367.00 (916.00&#x2010;1697.00)</td><td align="left" valign="top">1338.35 (236.03)</td><td align="left" valign="top">998.00 (726.00&#x2010;1470.00)</td><td align="left" valign="top">1048.35 (195.26)</td></tr><tr><td align="left" valign="top">Reading difficulty score</td><td align="left" valign="top">12.81 (11.13&#x2010;15.16)</td><td align="left" valign="top">12.88 (0.82)</td><td align="left" valign="top">12.30 (11.21&#x2010;14.52)</td><td align="left" valign="top">12.38 (0.90)</td></tr><tr><td align="left" valign="top">Recommended reading age</td><td align="left" valign="top">13.00 (11.00&#x2010;15.00)</td><td align="left" valign="top">12.82 (0.78)</td><td align="left" valign="top">12.00 (11.00&#x2010;14.00)</td><td align="left" valign="top">12.29 (0.85)</td></tr></tbody></table></table-wrap><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Results of expert consistency analysis (Krippendorff &#x03B1;).</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Dimension</td><td align="left" valign="top">ChatGPT-4</td><td align="left" valign="top">MedGo</td><td align="left" valign="top">Qwen</td><td align="left" valign="top">ERNIE Bot</td></tr></thead><tbody><tr><td align="left" valign="top">Accuracy</td><td align="left" valign="top">0.61</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.52</td><td align="left" valign="top">0.34</td></tr><tr><td align="left" valign="top">Completeness</td><td align="left" valign="top">0.56</td><td align="left" valign="top">0.52</td><td align="left" valign="top">0.54</td><td align="left" valign="top">0.26</td></tr><tr><td align="left" valign="top">Readability</td><td align="char" char="." valign="top">0.68</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." 
valign="top">0.58</td><td align="char" char="." valign="top">0.54</td></tr><tr><td align="left" valign="top">Safety</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." valign="top">0.70</td><td align="char" char="." valign="top">0.59</td><td align="char" char="." valign="top">0.43</td></tr><tr><td align="left" valign="top">Humanity</td><td align="char" char="." valign="top">0.69</td><td align="char" char="." valign="top">0.75</td><td align="char" char="." valign="top">0.71</td><td align="char" char="." valign="top">0.63</td></tr></tbody></table></table-wrap><p><xref ref-type="fig" rid="figure2">Figure 2</xref> illustrates the performance of different LLMs across the 5 evaluation dimensions. <xref ref-type="fig" rid="figure2">Figures 2A-2E</xref> show the score trends for each model across 17 questions in terms of accuracy, completeness, humanity, readability, and safety, respectively. <xref ref-type="fig" rid="figure2">Figure 2F</xref> presents a radar chart comparing the overall normalized performance across all dimensions. ChatGPT-4 exhibited the best overall performance in all dimensions. The line chart shows the variation in average scores, with each line representing the scoring trend of a model across different questions. The radar chart offers a visual representation of each model&#x2019;s performance across the 5 dimensions. Each axis of the radar chart corresponds to an evaluation dimension, with a larger area indicating stronger performance in that dimension.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Line chart of the median scores and radar chart for the 4 large language models. 
(<bold>A</bold>) Accuracy, (<bold>B</bold>) completeness, (<bold>C</bold>) humanity, (<bold>D</bold>) readability, (<bold>E</bold>) safety, and (<bold>F</bold>) radar chart.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e73226_fig02.png"/></fig></sec><sec id="s3-2"><title>Phase 1: Secondary Outcomes</title><p>Descriptive statistics for the objective readability analysis are shown in <xref ref-type="table" rid="table4">Tables 4</xref> and <xref ref-type="table" rid="table6">6</xref> and <xref ref-type="fig" rid="figure3">Figure 3</xref>. Specifically, <xref ref-type="fig" rid="figure3">Figures 3A-3C</xref> display the variations in character count, reading difficulty, and recommended reading age across the 4 LLMs. <xref ref-type="fig" rid="figure3">Figure 3D</xref> shows the distribution of reading difficulty scores, and <xref ref-type="fig" rid="figure3">Figure 3E</xref> presents the proportions of education levels required to understand the responses. Chinese Character Count: ChatGPT-4 generated the highest total character count (22,752 characters) and the highest average word count (mean 1338.35, SD 236.03). In contrast, Qwen produced the shortest text, with a total of 13,481 characters and an average word count (mean 793.00, SD 283.64). Reading Difficulty Score: ChatGPT-4 had the highest average reading difficulty score (12.88), while ERNIE Bot had the lowest (11.92). 
Recommended Reading Age: ChatGPT-4 also had the highest mean recommended reading age (mean 12.82, SD 0.78 years), whereas ERNIE Bot had the lowest (mean 11.94, SD 1.43 years).</p><p>The results of the one-way ANOVA are shown in <xref ref-type="table" rid="table7">Table 7</xref>: Chinese Character Count: Significant differences were observed in the total text length among the LLMs (<italic>F</italic><sub>3,64</sub>=11.43&#x003E;<italic>F</italic><sub>crit</sub>=2.75; <italic>P</italic>&#x003C;.001), with the most significant difference between ChatGPT-4 and Qwen (<italic>P</italic>&#x003C;.001). Reading Difficulty Score: A significant difference was found in reading difficulty scores among the models (<italic>F</italic><sub>3,64</sub>=3.32&#x003E;<italic>F</italic><sub>crit</sub>=2.75; <italic>P</italic>=.03). Recommended Reading Age: No significant differences were observed in recommended reading age among the models (<italic>F</italic><sub>3,64</sub>=2.48&#x003C;<italic>F</italic><sub>crit</sub>=2.75; <italic>P</italic>=.07).</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Descriptive statistics of expert rating analysis and objective readability analysis (Qwen and ERNIE Bot).</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top" rowspan="2">Dimension</td><td align="left" valign="top" colspan="2">Qwen</td><td align="left" valign="top" colspan="2">ERNIE Bot</td></tr><tr><td align="left" valign="top">Median (IQR)</td><td align="left" valign="top">Mean (SD)</td><td align="left" valign="top">Median (IQR)</td><td align="left" valign="top">Mean (SD)</td></tr></thead><tbody><tr><td align="left" valign="top">Accuracy</td><td align="left" valign="top">4.0 (2.0-5.0)</td><td align="left" valign="top">3.91 (0.77)</td><td align="left" valign="top">4.0 (2.0&#x2010;5.0)</td><td align="left" valign="top">3.91 (0.81)</td></tr><tr><td align="left" valign="top">Completeness</td><td align="left" 
valign="top">4.0 (2.0&#x2010;5.0)</td><td align="left" valign="top">3.90 (0.78)</td><td align="left" valign="top">4.0 (2.0&#x2010;5.0)</td><td align="left" valign="top">3.96 (0.78)</td></tr><tr><td align="left" valign="top">Readability</td><td align="left" valign="top">4.0 (2.0&#x2010;5.0)</td><td align="left" valign="top">4.02 (0.81)</td><td align="left" valign="top">4.0 (2.0&#x2010;5.0)</td><td align="left" valign="top">3.99 (0.79)</td></tr><tr><td align="left" valign="top">Safety</td><td align="left" valign="top">4.0 (2.0&#x2010;5.0)</td><td align="left" valign="top">4.08 (0.80)</td><td align="left" valign="top">4.0 (2.0&#x2010;5.0)</td><td align="left" valign="top">4.05 (0.75)</td></tr><tr><td align="left" valign="top">Humanity</td><td align="left" valign="top">4.0 (2.0&#x2010;5.0)</td><td align="left" valign="top">4.27 (0.72)</td><td align="left" valign="top">4.0 (3.0&#x2010;5.0)</td><td align="left" valign="top">4.27 (0.75)</td></tr><tr><td align="left" valign="top">Chinese character count</td><td align="left" valign="top">772.00 (325.00&#x2010;1223.00)</td><td align="left" valign="top">793.00 (283.64)</td><td align="left" valign="top">867.00 (694.00&#x2010;2228.00)</td><td align="left" valign="top">979.12 (361.84)</td></tr><tr><td align="left" valign="top">Reading difficulty score</td><td align="left" valign="top">12.40 (11.26&#x2010;14.01)</td><td align="left" valign="top">12.28 (0.71)</td><td align="left" valign="top">11.95 (10.09&#x2010;14.15)</td><td align="left" valign="top">11.92 (1.09)</td></tr><tr><td align="left" valign="top">Recommended reading age</td><td align="left" valign="top">12.00 (11.00&#x2010;14.00)</td><td align="left" valign="top">12.18 (0.78)</td><td align="left" valign="top">12.00 (10.00&#x2010;14.00)</td><td align="left" valign="top">11.94 (1.43)</td></tr></tbody></table></table-wrap><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Comparative evaluation of large language model (LLM) responses on relevant 
questions. (<bold>A</bold>) Box plot showing the variation in text length among the 4 LLMs, with a significant difference observed between ChatGPT-4 and Qwen (<italic>P</italic>&#x003C;.001). (<bold>B</bold>) Box plot illustrating the variation in reading difficulty scores among the 4 LLMs. (<bold>C</bold>) Box plot showing the variation in recommended reading age among the 4 LLMs (<italic>P</italic>=.07). (<bold>D</bold>) Density plot displaying the distribution of reading difficulty scores among the models. (<bold>E</bold>) Bar chart showing the distribution of educational levels required to comprehend the responses. <italic>P</italic> values indicate pairwise comparisons of Chinese character count: ChatGPT-4 versus MedGo (<italic>P</italic>=.002), ChatGPT-4 versus Qwen (<italic>P</italic>&#x003C;.001), ChatGPT-4 versus ERNIE Bot (<italic>P</italic>=.02), and MedGo versus Qwen (<italic>P</italic>=.04); and for reading difficulty score: ChatGPT-4 versus MedGo (<italic>P</italic>=.01). All rating data in this study were tested and found to follow a normal or approximately normal distribution.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e73226_fig03.png"/></fig><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>One-way ANOVA of objective readability.</p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top">Dimension</td><td align="left" valign="top"><italic>F</italic> test (<italic>df</italic>)</td><td align="left" valign="top"><italic>P</italic> value</td><td align="left" valign="top"><italic>F</italic><sub>crit</sub></td></tr></thead><tbody><tr><td align="left" valign="top">Accuracy</td><td align="left" valign="top">4.93 (3, 404)</td><td align="left" valign="top">.002</td><td align="left" valign="top">2.63</td></tr><tr><td align="left" valign="top">Completeness</td><td align="left" valign="top">7.01 (3, 404)</td><td align="left" 
valign="top">&#x003C;.001</td><td align="left" valign="top">2.63</td></tr><tr><td align="left" valign="top">Readability</td><td align="left" valign="top">2.87 (3, 404)</td><td align="left" valign="top">.04</td><td align="left" valign="top">2.63</td></tr><tr><td align="left" valign="top">Safety</td><td align="left" valign="top">4.06 (3, 404)</td><td align="left" valign="top">.007</td><td align="left" valign="top">2.63</td></tr><tr><td align="left" valign="top">Humanity</td><td align="left" valign="top">6.18 (3, 404)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">2.63</td></tr><tr><td align="left" valign="top">Chinese character count</td><td align="left" valign="top">11.43 (3, 64)</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">2.75</td></tr><tr><td align="left" valign="top">Reading difficulty score</td><td align="left" valign="top">3.32 (3, 64)</td><td align="left" valign="top">.03</td><td align="left" valign="top">2.75</td></tr><tr><td align="left" valign="top">Recommended reading age</td><td align="left" valign="top">2.48 (3, 64)</td><td align="left" valign="top">.07</td><td align="left" valign="top">2.75</td></tr></tbody></table></table-wrap></sec><sec id="s3-3"><title>Phase 2: Patient Interaction Results</title><p>A total of 30 eligible patients were recruited, generating 90 questions (<xref ref-type="fig" rid="figure4">Figure 4</xref>). 
These, along with expert-suggested questions, were categorized into 8 groups: basic definitions and types, causes and risk factors, daily management and care, rehabilitation training, effectiveness and safety, emotional support, complications, and rehabilitation environment and equipment.</p><p>A 2-tailed paired <italic>t</italic> test showed that the average score for ChatGPT-4 (mean 3.34, SD 0.64) was slightly higher than that for MedGo (mean 3.04, SD 0.78), with a statistically significant difference between the 2 models (<italic>t</italic><sub>89</sub>=2.65; <italic>P</italic>=.01; 95% CI 0.08&#x2010;0.53).</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>The Sankey diagram illustrates the classification of questions in both phases. On the left, 90 questions posed by 30 patients are shown, while on the right, the 15 integrated questions are displayed.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e73226_fig04.png"/></fig></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>This study evaluated the performance of ChatGPT-4, MedGo, Qwen, and ERNIE Bot in providing health education for patients with stroke undergoing home rehabilitation.</p><p>In phase 1, ChatGPT-4 demonstrated the best overall performance across all dimensions, excelling particularly in humanity and safety. This finding aligns with previous studies [<xref ref-type="bibr" rid="ref21">21</xref>]. MedGo, as a medical-specific model, excelled in accuracy and completeness, underscoring its potential for medical text processing and generation. Qwen and ERNIE Bot received lower scores across all 5 dimensions than ChatGPT-4 and MedGo, indicating a significant performance gap. 
ChatGPT-4 showed a high and concentrated reading difficulty score distribution, making it well-suited for scenarios that require complex content generation. However, this may present readability challenges for general users. Moreover, although its superiority in response quality was well demonstrated, ChatGPT-4&#x2019;s high total character count and average word count may correlate with delayed response times. Prior studies indicated that LLM response latency increased by 0.8&#x2010;1.2 seconds per 100 Chinese characters on standard graphics processing units, which meant that ChatGPT-4&#x2019;s responses required 10&#x2010;16 seconds to generate, which may impair user engagement [<xref ref-type="bibr" rid="ref22">22</xref>]. ERNIE Bot and MedGo exhibited lower and more stable readability scores, suggesting that they produce easier-to-read content, making them more suitable for general users or tasks that demand lower reading difficulty. Qwen displayed a wide range of readability scores, reflecting greater variability in reading difficulty but with relatively lower stability than the other models.</p><p>In phase 2 of patient interactions, ChatGPT-4 received higher ratings than MedGo, but overall, the ratings were lower than those given by the expert group. This discrepancy is mainly due to the challenges patients face in evaluating the models, including variations in personal understanding, needs, and the models&#x2019; performance and applicability. This is particularly evident when dealing with complex medical information. As supported by the inverse correlation between text length and satisfaction, ChatGPT-4&#x2019;s detailed explanations and high reading difficulty could exceed the working memory and understanding capacity of older adults, imposing a heavy cognitive burden on them. 
Moreover, as artificial intelligence (AI) in medical decision-making is still developing, patients tend to be more skeptical of the models&#x2019; accuracy and reliability, often finding their responses unclear. In contrast, experts, with their accumulated knowledge and familiarity with medical terminology, are better equipped to interpret the models&#x2019; medical information, resulting in higher ratings.</p><p>There are significant differences in the areas of focus between patients and experts. Patients tend to prioritize rehabilitation methods, outcomes and safety, emotional support, and equipment-related concerns, reflecting their practical needs and psychological state during rehabilitation. Many patients with stroke are primarily concerned with improving their quality of life through daily management and care. Given the psychological pressures they face during rehabilitation, emotional support is also crucial. In contrast, experts focus on the effectiveness of rehabilitation plans, safety in technical aspects, and the dissemination of theoretical knowledge. As professionals, they are more likely to base treatment and rehabilitation plans on scientific evidence.</p><p>This disparity highlights the communication gap between experts and patients in health care. Patients may struggle to fully understand certain medical terms and treatment approaches, leading to confusion or anxiety about their rehabilitation plans. While experts emphasize treatment outcomes and safety, they must also consider how to effectively communicate this specialized knowledge to patients, fostering a correct understanding of rehabilitation and improving treatment adherence.</p><p>According to standardized prompts, each LLM was asked to provide sources for their responses. ChatGPT-4 did not explicitly cite references, but its answers, based on its extensive training dataset, generally aligned with medical knowledge and clinical practice. 
In contrast, MedGo provided more detailed medical support, citing specific medical literature, treatment guidelines, and clinical studies. However, the responses from Qwen and ERNIE Bot lacked clear citations of literature and concrete clinical evidence.</p><p>Some responses from the LLMs contained significant errors, which exposed critical safety vulnerabilities. For example, in question 3, which addressed the optimal period for stroke rehabilitation, Qwen incorrectly stated that the chronic phase of stroke begins 3 months after onset. According to various medical guidelines, the chronic phase typically starts 6 months after a stroke, making Qwen&#x2019;s response inconsistent with these guidelines. Temporal misclassification may lead to premature termination of intensive therapy, reducing motor function recovery by 15%&#x2010;22% [<xref ref-type="bibr" rid="ref23">23</xref>]. In question 7, concerning commonly used medications during home-based stroke rehabilitation, ERNIE Bot provided an incorrect answer, mentioning alteplase, a thrombolytic drug that cannot be taken at home and must be administered intravenously in a hospital setting, carrying a 6.7% risk of hemorrhagic complications if implemented [<xref ref-type="bibr" rid="ref24">24</xref>]. The use of alteplase requires professional monitoring and must be administered within 4.5 hours of stroke onset. To improve this issue in future models, pretraining filtration should exclude non&#x2013;hospital-administered medications from training data, while warnings ought to be triggered when responses contain high-risk medical terms. Furthermore, all medication-related queries need to be reviewed by clinicians afterward.</p><p>Analysis revealed that ChatGPT-4 made fewer errors, although it occasionally produced &#x201C;hallucinations&#x201D; due to issues with patient language expression. MedGo demonstrated high accuracy but lacked personalized care. 
Qwen and ERNIE Bot provided incomplete and vague responses.</p><p>The errors observed in the LLMs could impact patient rehabilitation to some extent. This is primarily because LLMs generate responses based on statistical language models rather than true understanding of the questions. They lack genuine comprehension and reasoning abilities. Their training depends heavily on large volumes of open-text data from the web, which does not guarantee the quality or timeliness of the answers. Medical knowledge is vast and complex, and the capabilities of LLMs vary. General-purpose LLMs struggle with specialized medical language and often lack explainability. These models are particularly prone to errors in areas such as disease diagnosis, drug effects, and emerging medical issues.</p><p>Therefore, when addressing medical questions, especially in health care decision-making, reliance on AI models should be approached with caution. Professional medical judgment remains irreplaceable, particularly when it concerns patient health and treatment plans.</p></sec><sec id="s4-2"><title>Comparison With Prior Work</title><p>A growing body of literature has evaluated the capabilities of LLMs across a wide range of clinical tasks, particularly in areas such as diagnosis support [<xref ref-type="bibr" rid="ref25">25</xref>], test preparation [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>], and medical documentation [<xref ref-type="bibr" rid="ref28">28</xref>]. Most of these traditional LLM comparison studies involved models such as ChatGPT-3.5/4, Claude 2, Bard, and Qwen, and focused on measurable dimensions such as accuracy [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>], specific competencies, and reasoning performance. 
These assessments often used standardized benchmark questions or national-level medical examinations as proxies for real-world expertise, such as the German Medical State Exam [<xref ref-type="bibr" rid="ref31">31</xref>], the Chinese Nursing Licensing Exam [<xref ref-type="bibr" rid="ref32">32</xref>], or the NEET-2023 in India [<xref ref-type="bibr" rid="ref33">33</xref>].</p><p>In contrast, LLM applications in the domain of stroke and rehabilitation have been relatively limited and primarily focused on acute care or clinical prediction tasks. For example, one recent study used GPT-4 to predict 90-day mortality in ischemic stroke management [<xref ref-type="bibr" rid="ref34">34</xref>] while another investigated the application of ChatGLM-6B in stroke diagnosis, subtype identification, and treatment eligibility screening [<xref ref-type="bibr" rid="ref35">35</xref>]. These studies, although valuable, are oriented toward professional use. They focus on the performance of general-purpose LLMs [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>], and research on medical-specific models remains limited. For instance, some studies have found that Med-PaLM 2 has made significant progress in medical question answering, particularly across multiple medical benchmarks and real-world problem-solving [<xref ref-type="bibr" rid="ref38">38</xref>]. However, this model is not tailored for the Chinese medical context. Our study expands this line of inquiry by targeting a patient-facing use case&#x2014;home-based stroke rehabilitation education&#x2014;which involves not only clinical accuracy but also empathy, comprehensibility, and functional utility in patient self-management.</p><p>Furthermore, while ChatGPT-4 has consistently demonstrated strong performance in prior research [<xref ref-type="bibr" rid="ref39">39</xref>], its superiority is not absolute. 
In our results, MedGo&#x2014;a model developed for clinical Chinese QA&#x2014;produced more concise and actionable responses in caregiver instruction tasks. This suggests that domain-adapted models may outperform generalist models when the prompt is tightly aligned with their specialization, echoing findings from recent studies in oral health and pediatric triage [<xref ref-type="bibr" rid="ref40">40</xref>].</p><p>Another key distinction is our methodological framework. Many previous studies relied solely on expert judgments or automatic evaluation metrics. In contrast, our design includes both multidisciplinary expert ratings and real patient scoring, offering a dual-perspective validation framework. This approach provides ecological relevance by reflecting both clinical quality and user-perceived usefulness&#x2014;dimensions often overlooked in LLM assessment [<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>Finally, our findings underscore the need for task-specific, longitudinal, and patient-centered evaluation of LLMs. Future research should incorporate more diverse rehabilitation populations, integrate multilingual models (eg, Qwen-2.5Max, Gemini, and Med-Gemini), and assess user trust, emotional alignment, and practical impact over time. The creation and evaluation standards for medical LLMs must be actively developed by the medical community [<xref ref-type="bibr" rid="ref41">41</xref>] and validated through real-world experiments.</p></sec><sec id="s4-3"><title>Limitations</title><p>Although this study provides valuable insights into the application of LLMs in home-based stroke rehabilitation health education, several limitations should be noted. First, the expert sample size was small, with only 6 experts participating in the ratings, which may have affected the comprehensiveness and representativeness of the evaluations. The interrater consistency among experts was limited. 
This variability may stem from several factors, including differing professional perspectives, variable familiarity with LLM outputs, and inherent subjectivity in evaluating response quality across multiple dimensions. Furthermore, some rating criteria&#x2014;such as &#x201C;empathy&#x201D; or &#x201C;clinical applicability&#x201D;&#x2014;may be interpreted differently by clinicians and nonclinicians. Second, to facilitate patient understanding and streamline the rating process, only a satisfaction scale was used in the second phase of interaction, resulting in a simplified rating criterion. Third, the initial question design did not sufficiently address issues related to patient emotions and adherence. These dimensions were not apparent during phase 1, when questions were primarily developed based on medical platforms and existing literature&#x2014;sources that tend to focus more on clinical procedures than on psychological or behavioral needs. However, during phase 2, through patient interviews and analysis of interaction data, we recognized that emotional fluctuations and treatment adherence play a vital role in the success of home-based rehabilitation. Fourth, the broader implementation of LLMs requires careful consideration of their economic viability, feasibility, and sustainability. While this study focused primarily on the academic perspective, real-world applications must address cost-effectiveness, technological barriers, and the potential impact on health care institutions. Based on comparable systems, deploying LLMs would incur application programming interface and graphics processing unit maintenance costs, while cost savings emerge when the clinic workload is reduced. However, whether breakeven can be approached remains unknown [<xref ref-type="bibr" rid="ref42">42</xref>]. Fifth, model selection was constrained by temporal and practical considerations. 
Although ChatGPT-4.0, MedGo, Qwen-Max, and ERNIE Bot V3.5 were representative and widely used at the time of study design (January 2025), newer models such as Qwen-2.5Max and Med-Gemini were not included due to release timelines and limited accessibility. Specifically, Qwen-2.5Max was launched after the study protocol was finalized and required a paid subscription for application programming interface access, which may not reflect typical user access in community-based health care. Med-Gemini, although released earlier, had not been widely validated in peer-reviewed Chinese language medical research and remained restricted in practical application. In addition, domestic models such as Doubao (Cici) were excluded due to their lack of domain-specific medical optimization and low representation in scholarly health care literature. Finally, the health education content generated by LLMs still presents potential biases, inconsistent information, and a lack of explainability. Research [<xref ref-type="bibr" rid="ref43">43</xref>] indicates that the use of LLMs in health care faces challenges related to information reliability, biases, ethical compliance, and patient acceptance.</p><p>Future research should focus on the following aspects. First, studies should consider increasing the number of expert raters within each professional subgroup to average out individual bias, standardizing scoring rubrics through iterative training, and using consensus-building techniques such as Delphi methods or calibration rounds prior to formal scoring. It may also be beneficial to incorporate mixed methods triangulation (eg, combining expert scores with patient evaluations or objective metrics) to strengthen the robustness of model performance assessments [<xref ref-type="bibr" rid="ref44">44</xref>]. Second, a unified rating standard should be established when comparing expert and patient ratings to ensure result comparability. 
Specifically, for personalized home rehabilitation education for patients with stroke, future studies could expand the sample size and include evaluations from diverse patient groups, further exploring the adaptability of LLMs at different stages of rehabilitation. Third, subsequent research should consider participatory design strategies&#x2014;such as structured patient interviews or patient-reported outcome measures&#x2014;to ensure that emotional and motivational aspects are appropriately captured. Fourth, to address potential resource challenges in LLM application, feasibility assessments should be conducted regarding their management, use, and long-term maintenance. Fifth, research should expand the comparative framework to include newer LLMs such as Qwen-2.5Max, Med-Gemini, and international contenders such as Gemini [<xref ref-type="bibr" rid="ref45">45</xref>], with a focus on evaluating their real-world performance in diverse clinical contexts. Longitudinal studies tracking LLM performance across updates may also be warranted to assess consistency, robustness, and scalability of health care&#x2013;oriented outputs over time. Finally, further optimization of model algorithms is needed to improve the reliability of medical knowledge bases, and stronger oversight and regulation of AI-generated health information are essential [<xref ref-type="bibr" rid="ref46">46</xref>].</p></sec><sec id="s4-4"><title>Conclusions</title><p>This 2-phase evaluation demonstrated that LLMs, particularly ChatGPT-4 and MedGo, show considerable promise in supporting home-based stroke rehabilitation education. ChatGPT-4 achieved the highest scores across all expert-evaluated dimensions, excelling in user-centeredness and comprehensiveness, while MedGo, a domain-specific model, produced more concise, evidence-based responses. In contrast, general purpose models such as Qwen and ERNIE Bot performed less consistently across key evaluation criteria. 
Notably, patients&#x2019; satisfaction ratings were lower than those of experts, highlighting potential usability challenges related to language complexity and trust in automated responses. These findings underscore the importance of aligning LLM-generated content with patient comprehension levels, emotional needs, and health literacy. Future work should focus on the development of hybrid models that integrate the conversational fluency of general purpose LLMs with the domain accuracy of medically trained models. Additional studies involving diverse patient populations, longitudinal designs, and real-world deployment scenarios are warranted to ensure safe, effective, and equitable integration of LLMs into patient-centered rehabilitation care.</p></sec></sec></body><back><ack><p>This study was supported by the following funding sources: the 2022 Pudong New District Health System Leading Talent Cultivation Program (grant PWRd2022-16), the Shanghai Science and Technology Commission Project (grant 24692115300), based at Zhongshan Hospital, Fudan University, the Tongji University School of Nursing Discipline Construction Three-Year Action Plan (2022&#x2010;2025), Leading Talent Project (grant JS2210204), and the Pudong New District Health Commission project &#x201C;Revision of the Quick Aphasia Assessment Tool (QAB) and its Application in Post-Stroke Aphasia Patients&#x201D; (grant PW2022A-0). Funding support was jointly provided by Tongji University and Shanghai East Hospital. During manuscript preparation, ChatGPT-4 (OpenAI) was used to assist in language editing and error checking. Some icons used in <xref ref-type="fig" rid="figure1">Figure 1</xref> were sourced from Bioicons [<xref ref-type="bibr" rid="ref20">20</xref>], which are distributed under the Creative Commons Attribution 4.0 International (CC BY 4.0) license. 
All icon usage complies with the corresponding licensing requirements.</p></ack><notes><sec><title>Data Availability</title><p>The full data supporting this study&#x2014;including expert scoring data from phase 1, patient questions and scoring records from phase 2, and the ethics approval document&#x2014;can be accessed in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendices 2</xref><xref ref-type="supplementary-material" rid="app3"/>-<xref ref-type="supplementary-material" rid="app4">4</xref>.</p></sec></notes><fn-group><fn fn-type="con"><p>SQ (lead), YL (equal), and ZH (lead) contributed to conceptualization; SQ contributed to data curation; SQ (lead), YG (supporting), and YZ (supporting) performed the formal analysis; HY (lead) and HS (equal) contributed to funding acquisition; YL (lead), YW (equal), and NH (supporting) led the investigation; SQ (lead), HY (supporting), and HZ (supporting) contributed to methodology; YL (lead), YW (supporting), and YZ (supporting) led the resources; YL (lead) and ZX (supporting) performed the validation; SQ (lead) performed the visualization; SQ (lead) and HY (supporting) contributed to writing&#x2014;original draft; and HZ (lead), HY (equal), and HS (equal) contributed to writing&#x2014;review and editing.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Feigin</surname><given-names>VL</given-names> </name><name name-style="western"><surname>Brainin</surname><given-names>M</given-names> </name><name name-style="western"><surname>Norrving</surname><given-names>B</given-names> 
</name><etal/></person-group><article-title>World Stroke Organization (WSO): global stroke fact sheet 2022</article-title><source>Int J Stroke</source><year>2022</year><month>01</month><volume>17</volume><issue>1</issue><fpage>18</fpage><lpage>29</lpage><pub-id pub-id-type="doi">10.1177/17474930211065917</pub-id><pub-id pub-id-type="medline">34986727</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Markus</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Brainin</surname><given-names>M</given-names> </name><name name-style="western"><surname>Fisher</surname><given-names>M</given-names> </name></person-group><article-title>Tracking the global burden of stoke and dementia: World Stroke Day 2020</article-title><source>Int J Stroke</source><year>2020</year><month>10</month><volume>15</volume><issue>8</issue><fpage>817</fpage><lpage>818</lpage><pub-id pub-id-type="doi">10.1177/1747493020959186</pub-id><pub-id pub-id-type="medline">33115386</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bartoli</surname><given-names>D</given-names> </name><name name-style="western"><surname>Petrizzo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Vellone</surname><given-names>E</given-names> </name><name name-style="western"><surname>Alvaro</surname><given-names>R</given-names> </name><name name-style="western"><surname>Pucciarelli</surname><given-names>G</given-names> </name></person-group><article-title>Impact of telehealth on stroke survivor-caregiver dyad in at-home rehabilitation: a systematic review</article-title><source>J Adv Nurs</source><year>2024</year><month>10</month><volume>80</volume><issue>10</issue><fpage>4003</fpage><lpage>4033</lpage><pub-id 
pub-id-type="doi">10.1111/jan.16177</pub-id><pub-id pub-id-type="medline">38563582</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A survey on evaluation of large language models</article-title><source>ACM Trans Intell Syst Technol</source><year>2024</year><month>06</month><day>30</day><volume>15</volume><issue>3</issue><fpage>1</fpage><lpage>45</lpage><pub-id pub-id-type="doi">10.1145/3641289</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Denecke</surname><given-names>K</given-names> </name><name name-style="western"><surname>May</surname><given-names>R</given-names> </name><name name-style="western"><surname>Rivera Romero</surname><given-names>O</given-names> </name><collab>LLMHealthGroup</collab></person-group><article-title>Potential of large language models in health care: Delphi study</article-title><source>J Med Internet Res</source><year>2024</year><month>05</month><day>13</day><volume>26</volume><fpage>e52399</fpage><pub-id pub-id-type="doi">10.2196/52399</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wilhelm</surname><given-names>TI</given-names> </name><name name-style="western"><surname>Roos</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kaczmarczyk</surname><given-names>R</given-names> </name></person-group><article-title>Large language models for therapy recommendations across 3 clinical 
specialties: comparative study</article-title><source>J Med Internet Res</source><year>2023</year><month>10</month><day>30</day><volume>25</volume><fpage>e49324</fpage><pub-id pub-id-type="doi">10.2196/49324</pub-id><pub-id pub-id-type="medline">37902826</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lv</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lai</surname><given-names>H</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>J</given-names> </name></person-group><article-title>Leveraging large language models for improved patient access and self-management: assessor-blinded comparison between expert- and AI-generated content</article-title><source>J Med Internet Res</source><year>2024</year><month>04</month><day>25</day><volume>26</volume><fpage>e55847</fpage><pub-id pub-id-type="doi">10.2196/55847</pub-id><pub-id pub-id-type="medline">38663010</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>An</surname><given-names>B</given-names> </name></person-group><article-title>MedGo: a Chinese medical large language model</article-title><source>arXiv</source><comment>Preprint posted online on 2024</comment><comment>arXiv:2410.20428</comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Lee</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>E</given-names> </name><name name-style="western"><surname>McDougal</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lytton</surname><given-names>W</given-names> </name></person-group><article-title>Large language model (GPT-4) accurately localizes stroke lesions (P8-4.002)</article-title><source>Neurology (ECronicon)</source><year>2024</year><month>04</month><day>9</day><volume>102</volume><issue>7 Suppl 1</issue><fpage>2563</fpage><pub-id pub-id-type="doi">10.1212/WNL.0000000000204601</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lehnen</surname><given-names>NC</given-names> </name><name name-style="western"><surname>Dorn</surname><given-names>F</given-names> </name><name name-style="western"><surname>Wiest</surname><given-names>IC</given-names> </name><etal/></person-group><article-title>Data extraction from free-text reports on mechanical thrombectomy in acute ischemic stroke using ChatGPT: a retrospective analysis</article-title><source>Radiology</source><year>2024</year><month>04</month><volume>311</volume><issue>1</issue><fpage>e232741</fpage><pub-id pub-id-type="doi">10.1148/radiol.232741</pub-id><pub-id pub-id-type="medline">38625006</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Neo</surname><given-names>JRE</given-names> </name><name name-style="western"><surname>Ser</surname><given-names>JS</given-names> </name><name name-style="western"><surname>Tay</surname><given-names>SS</given-names> </name></person-group><article-title>Use of large language model-based chatbots in managing the rehabilitation concerns and education needs 
of outpatient stroke survivors and caregivers</article-title><source>Front Digit Health</source><year>2024</year><volume>6</volume><fpage>1395501</fpage><pub-id pub-id-type="doi">10.3389/fdgth.2024.1395501</pub-id><pub-id pub-id-type="medline">38784703</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>KD</given-names> </name><name name-style="western"><surname>Fernandez</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Schwartz</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Comparing GPT-4 and human researchers in health care data analysis: qualitative description study</article-title><source>J Med Internet Res</source><year>2024</year><month>08</month><day>21</day><volume>26</volume><fpage>e56500</fpage><pub-id pub-id-type="doi">10.2196/56500</pub-id><pub-id pub-id-type="medline">39167785</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deiner</surname><given-names>MS</given-names> </name><name name-style="western"><surname>Deiner</surname><given-names>NA</given-names> </name><name name-style="western"><surname>Hristidis</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Use of large language models to assess the likelihood of epidemics from the content of tweets: infodemiology study</article-title><source>J Med Internet Res</source><year>2024</year><month>03</month><day>1</day><volume>26</volume><fpage>e49139</fpage><pub-id pub-id-type="doi">10.2196/49139</pub-id><pub-id pub-id-type="medline">38427404</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Klijn</surname><given-names>CJM</given-names> </name><name name-style="western"><surname>Hankey</surname><given-names>GJ</given-names> </name><collab>American Stroke Association and European Stroke Initiative</collab></person-group><article-title>Management of acute ischaemic stroke: new guidelines from the American Stroke Association and European Stroke Initiative</article-title><source>Lancet Neurol</source><year>2003</year><month>11</month><volume>2</volume><issue>11</issue><fpage>698</fpage><lpage>701</lpage><pub-id pub-id-type="doi">10.1016/s1474-4422(03)00558-1</pub-id><pub-id pub-id-type="medline">14572738</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Benchmarking four large language models&#x2019; performance of addressing Chinese patients&#x2019; inquiries about dry eye disease: a two-phase study</article-title><source>Heliyon</source><year>2024</year><month>07</month><volume>10</volume><issue>14</issue><fpage>e34391</fpage><pub-id pub-id-type="doi">10.1016/j.heliyon.2024.e34391</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sezgin</surname><given-names>E</given-names> </name><name name-style="western"><surname>Chekeni</surname><given-names>F</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Keim</surname><given-names>S</given-names> </name></person-group><article-title>Clinical accuracy of large language models and Google search 
responses to postpartum depression questions: cross-sectional study</article-title><source>J Med Internet Res</source><year>2023</year><month>09</month><day>11</day><volume>25</volume><fpage>e49240</fpage><pub-id pub-id-type="doi">10.2196/49240</pub-id><pub-id pub-id-type="medline">37695668</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cheng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Dong</surname><given-names>J</given-names> </name></person-group><article-title>On key factors of text reading difficulty grading and readability formula based on Chinese textbook corpus</article-title><source>Yuyan Wenzi Yingyong (Appl Linguist Stud)</source><year>2020</year><issue>1</issue><fpage>132</fpage><lpage>143</lpage></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>Online text readability analysis tool</article-title><source>Chinese Resource Platform</source><access-date>2025-01-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="http://120.27.70.114:8000/analysis_a">http://120.27.70.114:8000/analysis_a</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><article-title>Online tool for data visualization and statistical analysis</article-title><source>HIPLOT</source><year>2022</year><month>07</month><day>18</day><access-date>2025-01-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://hiplot.org">https://hiplot.org</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="web"><article-title>Free open-source icons for scientific 
use</article-title><source>Bioicons</source><year>2024</year><month>04</month><access-date>2025-01-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://bioicons.com/">https://bioicons.com/</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cho</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Leveraging large language models for improved understanding of communications with patients with cancer in a call center setting: proof-of-concept study</article-title><source>J Med Internet Res</source><year>2024</year><month>12</month><day>11</day><volume>26</volume><fpage>e63892</fpage><pub-id pub-id-type="doi">10.2196/63892</pub-id><pub-id pub-id-type="medline">39661975</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goh</surname><given-names>E</given-names> </name><name name-style="western"><surname>Gallo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hom</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Large language model influence on diagnostic reasoning: a randomized clinical trial</article-title><source>JAMA Netw Open</source><year>2024</year><month>10</month><day>1</day><volume>7</volume><issue>10</issue><fpage>e2440969</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.40969</pub-id><pub-id pub-id-type="medline">39466245</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>DeLuca</surname><given-names>M</given-names> </name><name name-style="western"><surname>Low</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kumari</surname><given-names>V</given-names> </name><name name-style="western"><surname>Parton</surname><given-names>A</given-names> </name><name name-style="western"><surname>Davis</surname><given-names>J</given-names> </name><name name-style="western"><surname>Mohagheghi</surname><given-names>AA</given-names> </name></person-group><article-title>A systematic review with meta-analysis of the StartReact effect on motor responses in stroke survivors and healthy individuals</article-title><source>J Neurophysiol</source><year>2022</year><month>04</month><day>1</day><volume>127</volume><issue>4</issue><fpage>938</fpage><lpage>945</lpage><pub-id pub-id-type="doi">10.1152/jn.00392.2021</pub-id><pub-id pub-id-type="medline">35235444</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Beyranvand</surname><given-names>MR</given-names> </name><name name-style="western"><surname>Asadpour Piranfar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Solaymani-Dodaran</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Assessment of reperfusion efficacy of Altelyse versus Actilyse in patients with acute myocardial infarction: a phase 3, randomized, double-blinded, non-inferiority clinical trial</article-title><source>Clin Drug Investig</source><year>2025</year><month>02</month><volume>45</volume><issue>2</issue><fpage>101</fpage><lpage>110</lpage><pub-id pub-id-type="doi">10.1007/s40261-025-01420-3</pub-id><pub-id pub-id-type="medline">39873854</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Chiu</surname><given-names>WHK</given-names> </name><name name-style="western"><surname>Ko</surname><given-names>WSK</given-names> </name><name name-style="western"><surname>Cho</surname><given-names>WCS</given-names> </name><name name-style="western"><surname>Hui</surname><given-names>SYJ</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>WCL</given-names> </name><name name-style="western"><surname>Kuo</surname><given-names>MD</given-names> </name></person-group><article-title>Evaluating the diagnostic performance of large language models on complex multimodal medical cases</article-title><source>J Med Internet Res</source><year>2024</year><month>05</month><day>13</day><volume>26</volume><fpage>e53724</fpage><pub-id pub-id-type="doi">10.2196/53724</pub-id><pub-id pub-id-type="medline">38739441</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alqahtani</surname><given-names>T</given-names> </name><name name-style="western"><surname>Badreldin</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Alrashed</surname><given-names>M</given-names> </name><etal/></person-group><article-title>The emergent role of artificial intelligence, natural learning processing, and large language models in higher education and research</article-title><source>Res Social Adm Pharm</source><year>2023</year><month>08</month><volume>19</volume><issue>8</issue><fpage>1236</fpage><lpage>1242</lpage><pub-id pub-id-type="doi">10.1016/j.sapharm.2023.05.016</pub-id><pub-id pub-id-type="medline">37321925</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kung</surname><given-names>TH</given-names> </name><name 
name-style="western"><surname>Cheatham</surname><given-names>M</given-names> </name><name name-style="western"><surname>Medenilla</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT on USMLE: potential for AI-assisted medical education using large language models</article-title><source>PLoS Digit Health</source><year>2023</year><month>02</month><volume>2</volume><issue>2</issue><fpage>e0000198</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000198</pub-id><pub-id pub-id-type="medline">36812645</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>B</given-names> </name></person-group><article-title>Large language models in summarizing radiology report impressions for lung cancer in Chinese: evaluation study</article-title><source>J Med Internet Res</source><year>2025</year><month>04</month><day>3</day><volume>27</volume><fpage>e65547</fpage><pub-id pub-id-type="doi">10.2196/65547</pub-id><pub-id pub-id-type="medline">40179389</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bedi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Orr-Ewing</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Testing and evaluation of health care applications of large language 
models: a systematic review</article-title><source>JAMA</source><year>2025</year><month>01</month><day>28</day><volume>333</volume><issue>4</issue><fpage>319</fpage><lpage>328</lpage><pub-id pub-id-type="doi">10.1001/jama.2024.21700</pub-id><pub-id pub-id-type="medline">39405325</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kasneci</surname><given-names>E</given-names> </name><name name-style="western"><surname>Sessler</surname><given-names>K</given-names> </name><name name-style="western"><surname>K&#x00FC;chemann</surname><given-names>S</given-names> </name><etal/></person-group><article-title>ChatGPT for good? On opportunities and challenges of large language models for education</article-title><source>Learn Individ Differ</source><year>2023</year><month>04</month><volume>103</volume><fpage>102274</fpage><pub-id pub-id-type="doi">10.1016/j.lindif.2023.102274</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roos</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kasapovic</surname><given-names>A</given-names> </name><name name-style="western"><surname>Jansen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kaczmarczyk</surname><given-names>R</given-names> </name></person-group><article-title>Artificial intelligence in medical education: comparative analysis of ChatGPT, Bing, and medical students in Germany</article-title><source>JMIR Med Educ</source><year>2023</year><month>09</month><day>4</day><volume>9</volume><fpage>e46482</fpage><pub-id pub-id-type="doi">10.2196/46482</pub-id><pub-id pub-id-type="medline">37665620</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Zhu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>F</given-names> </name></person-group><article-title>Qwen-2.5 outperforms other large language models in the Chinese national nursing licensing examination: retrospective cross-sectional comparative study</article-title><source>JMIR Med Inform</source><year>2025</year><month>01</month><day>10</day><volume>13</volume><fpage>e63731</fpage><pub-id pub-id-type="doi">10.2196/63731</pub-id><pub-id pub-id-type="medline">39793017</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Farhat</surname><given-names>F</given-names> </name><name name-style="western"><surname>Chaudhry</surname><given-names>BM</given-names> </name><name name-style="western"><surname>Nadeem</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sohail</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Madsen</surname><given-names>D&#x00D8;</given-names> </name></person-group><article-title>Evaluating large language models for the national premedical exam in India: comparative analysis of GPT-3.5, GPT-4, and Bard</article-title><source>JMIR Med Educ</source><year>2024</year><month>02</month><day>21</day><volume>10</volume><fpage>e51523</fpage><pub-id pub-id-type="doi">10.2196/51523</pub-id><pub-id pub-id-type="medline">38381486</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Shmilovitch</surname><given-names>AH</given-names> </name><name name-style="western"><surname>Katson</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cohen-Shelly</surname><given-names>M</given-names> </name><name name-style="western"><surname>Peretz</surname><given-names>S</given-names> </name><name name-style="western"><surname>Aran</surname><given-names>D</given-names> </name><name name-style="western"><surname>Shelly</surname><given-names>S</given-names> </name></person-group><article-title>GPT-4 as a clinical decision support tool in ischemic stroke management: evaluation study</article-title><source>JMIR AI</source><year>2025</year><month>03</month><day>7</day><volume>4</volume><fpage>e60391</fpage><pub-id pub-id-type="doi">10.2196/60391</pub-id><pub-id pub-id-type="medline">40053715</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Song</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>He</surname><given-names>F</given-names> </name><name name-style="western"><surname>Yin</surname><given-names>W</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name></person-group><article-title>Stroke diagnosis and prediction tool using ChatGLM: development and validation study</article-title><source>J Med Internet Res</source><year>2025</year><month>02</month><day>26</day><volume>27</volume><fpage>e67010</fpage><pub-id pub-id-type="doi">10.2196/67010</pub-id><pub-id pub-id-type="medline">40009850</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>R&#x00ED;os-Hoyo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Shan</surname><given-names>NL</given-names> </name><name name-style="western"><surname>Li</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pearson</surname><given-names>AT</given-names> </name><name name-style="western"><surname>Pusztai</surname><given-names>L</given-names> </name><name name-style="western"><surname>Howard</surname><given-names>FM</given-names> </name></person-group><article-title>Evaluation of large language models as a diagnostic aid for complex medical cases</article-title><source>Front Med (Lausanne)</source><year>2024</year><volume>11</volume><fpage>1380148</fpage><pub-id pub-id-type="doi">10.3389/fmed.2024.1380148</pub-id><pub-id pub-id-type="medline">38966538</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Oh</surname><given-names>N</given-names> </name><name name-style="western"><surname>Choi</surname><given-names>GS</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>WY</given-names> </name></person-group><article-title>ChatGPT goes to the operating room: evaluating GPT-4 performance and its potential in surgical education and training in the era of large language models</article-title><source>Ann Surg Treat Res</source><year>2023</year><month>05</month><volume>104</volume><issue>5</issue><fpage>269</fpage><lpage>273</lpage><pub-id pub-id-type="doi">10.4174/astr.2023.104.5.269</pub-id><pub-id pub-id-type="medline">37179699</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> 
</name><name name-style="western"><surname>Gottweis</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Toward expert-level medical question answering with large language models</article-title><source>Nat Med</source><year>2025</year><month>03</month><volume>31</volume><issue>3</issue><fpage>943</fpage><lpage>950</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03423-7</pub-id><pub-id pub-id-type="medline">39779926</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kianian</surname><given-names>R</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>D</given-names> </name><name name-style="western"><surname>Rojas-Carabali</surname><given-names>W</given-names> </name><name name-style="western"><surname>Agrawal</surname><given-names>R</given-names> </name><name name-style="western"><surname>Tsui</surname><given-names>E</given-names> </name></person-group><article-title>Large language models may help patients understand peer-reviewed scientific articles about ophthalmology: development and usability study</article-title><source>J Med Internet Res</source><year>2024</year><month>12</month><day>24</day><volume>26</volume><fpage>e59843</fpage><pub-id pub-id-type="doi">10.2196/59843</pub-id><pub-id pub-id-type="medline">39719077</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>You</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Evaluating and enhancing large language models&#x2019; performance in domain-specific medicine: development and usability study with 
DocOA</article-title><source>J Med Internet Res</source><year>2024</year><month>07</month><day>22</day><volume>26</volume><fpage>e58158</fpage><pub-id pub-id-type="doi">10.2196/58158</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shah</surname><given-names>NH</given-names> </name><name name-style="western"><surname>Entwistle</surname><given-names>D</given-names> </name><name name-style="western"><surname>Pfeffer</surname><given-names>MA</given-names> </name></person-group><article-title>Creation and adoption of large language models in medicine</article-title><source>JAMA</source><year>2023</year><month>09</month><day>5</day><volume>330</volume><issue>9</issue><fpage>866</fpage><lpage>869</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.14217</pub-id><pub-id pub-id-type="medline">37548965</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Waldock</surname><given-names>WJ</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Guni</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nabeel</surname><given-names>A</given-names> </name><name name-style="western"><surname>Darzi</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ashrafian</surname><given-names>H</given-names> </name></person-group><article-title>The accuracy and capability of artificial intelligence solutions in health care examinations and certificates: systematic review and meta-analysis</article-title><source>J Med Internet Res</source><year>2024</year><month>11</month><day>5</day><volume>26</volume><fpage>e56532</fpage><pub-id pub-id-type="doi">10.2196/56532</pub-id><pub-id 
pub-id-type="medline">39499913</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Wan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ni</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Applications and concerns of ChatGPT and other conversational large language models in health care: systematic review</article-title><source>J Med Internet Res</source><year>2024</year><month>11</month><day>7</day><volume>26</volume><fpage>e22769</fpage><pub-id pub-id-type="doi">10.2196/22769</pub-id><pub-id pub-id-type="medline">39509695</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Bhagat</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Shihab</surname><given-names>IF</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>A</given-names> </name></person-group><article-title>Accuracy is not agreement: expert-aligned evaluation of crash narrative classification models</article-title><source>arXiv</source><comment>Preprint posted online in 2025</comment><comment>arXiv:2504.13068</comment><pub-id pub-id-type="doi">10.48550/arXiv.2504.13068</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mesk&#x00F3;</surname><given-names>B</given-names> </name></person-group><article-title>The impact of multimodal large language models on health care&#x2019;s future</article-title><source>J Med Internet Res</source><year>2023</year><month>11</month><day>2</day><volume>25</volume><fpage>e52865</fpage><pub-id 
pub-id-type="doi">10.2196/52865</pub-id><pub-id pub-id-type="medline">37917126</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Minssen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Vayena</surname><given-names>E</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>IG</given-names> </name></person-group><article-title>The challenges for regulating medical use of ChatGPT and other large language models</article-title><source>JAMA</source><year>2023</year><month>07</month><day>25</day><volume>330</volume><issue>4</issue><fpage>315</fpage><lpage>316</lpage><comment>Erratum in: JAMA. 2023 Sep 12;330(10):974. doi: 10.1001/jama.2023.16286</comment><pub-id pub-id-type="doi">10.1001/jama.2023.9651</pub-id><pub-id pub-id-type="medline">37410482</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Phase 1 questionnaire design and the detailed definitions of the Likert 5-point rating scale.</p><media xlink:href="jmir_v27i1e73226_app1.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Phase 1 expert scoring data.</p><media xlink:href="jmir_v27i1e73226_app2.xlsx" xlink:title="XLSX File, 19 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Phase 2 patient questions and scoring data.</p><media xlink:href="jmir_v27i1e73226_app3.xlsx" xlink:title="XLSX File, 21 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Ethics committee approval document.</p><media xlink:href="jmir_v27i1e73226_app4.pdf" xlink:title="PDF File, 1336 KB"/></supplementary-material></app-group></back></article>