<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e92315</article-id><article-id pub-id-type="doi">10.2196/92315</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Performance of DeepSeek V3.2 and ChatGPT 5.1 in Musculoskeletal Triage and Differential Diagnosis of Outpatients With Low Back Pain: Multidimensional Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Ma</surname><given-names>Ziqian</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Chen</surname><given-names>Ruiyuan</given-names></name><degrees>MM</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Aobo</given-names></name><degrees>MD</degrees><xref ref-type="aff" 
rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Xi</surname><given-names>Yu</given-names></name><degrees>MM</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liang</surname><given-names>Minghui</given-names></name><degrees>MM</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yuan</surname><given-names>Shuo</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Fan</surname><given-names>Ning</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zang</surname><given-names>Jianwei</given-names></name><degrees>MM</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Tianyi</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Zang</surname><given-names>Lei</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Orthopedics, Beijing Chao-yang Hospital, Capital Medical University</institution><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff2"><institution>School of Kinesiology and Health, Capital University of Physical Education and Sports</institution><addr-line>Beijing</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Coristine</surname><given-names>Andrew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Zhang</surname><given-names>Jun</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Lai</surname><given-names>Xiangxun</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Lei Zang, MD, Department of Orthopedics, Beijing Chao-yang Hospital, Capital Medical University, Beijing, 100043, China, 151718688; <email>zanglei@ccmu.edu.cn</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>3</day><month>7</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e92315</elocation-id><history><date date-type="received"><day>28</day><month>01</month><year>2026</year></date><date date-type="rev-recd"><day>09</day><month>06</month><year>2026</year></date><date date-type="accepted"><day>10</day><month>06</month><year>2026</year></date></history><copyright-statement>&#x00A9; Ziqian Ma, Ruiyuan Chen, Aobo Wang, Yu Xi, Minghui Liang, Shuo Yuan, Ning Fan, Jianwei Zang, Tianyi Wang, Lei Zang. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 3.7.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e92315"/><abstract><sec><title>Background</title><p>Outpatients presenting with low back pain (LBP) often require efficient preconsultation triage and early differential diagnostic support. Large language models may assist these text-based tasks, but their performance under different clinical information conditions remains unclear.</p></sec><sec><title>Objective</title><p>This study aimed to compare the performance of ChatGPT (5.1; OpenAI) and DeepSeek (V3.2; DeepSeek AI) in musculoskeletal disorders (MSDs) triage and the differential diagnosis of outpatients with LBP using real-world outpatient records under 2 simulated information conditions.</p></sec><sec sec-type="methods"><title>Methods</title><p>This retrospective comparative study was conducted at a tertiary academic teaching hospital in Beijing. A total of 160 cases were included using a balanced design across 8 diagnostic categories (20 per category): 6 MSDs and 2 non-MSDs. Evaluation was performed in 2 phases: Phase 1 (chief complaint) and Phase 2 (structured questionnaire with 7 domains and 33 items), both executed in a zero-shot setting using standardized prompts. Outcomes included (1) triage accuracy, (2) preliminary diagnosis accuracy, and (3) differential diagnosis agreement. In Phase 2, 3 senior orthopedic evaluators additionally rated model rationales across 5 domains using a 5-point Likert scale.</p></sec><sec sec-type="results"><title>Results</title><p>For triage accuracy across all 160 cases, DeepSeek V3.2 improved from 84.4% to 90.6% (risk difference [RD] 6.2%, 95% CI &#x2013;0.7% to 13.3%), and ChatGPT 5.1 improved from 75.6% to 93.1% (RD 17.5%, 95% CI 10.2%-24.9%). 
For preliminary diagnosis accuracy across the 120 musculoskeletal cases, DeepSeek V3.2 improved from 48.3% to 76.7% (RD 28.3%, 95% CI 16.8%-38.8%), whereas ChatGPT 5.1 improved from 35.0% to 87.5% (RD 52.5%, 95% CI 42.8%-60.6%). The mean number of correct differential diagnoses increased from 1.27 (SD 0.71) to 2.02 (SD 0.74) for DeepSeek V3.2 and from 1.34 (SD 0.70) to 2.03 (SD 0.77) for ChatGPT 5.1. In Phase 2, rationale ratings were generally good for both models, with ChatGPT 5.1 scoring higher in understanding and reasoning. Recognition of multiple myeloma (MM) remained limited, improving only from 45% to 55% (DeepSeek V3.2) and 55% to 60% (ChatGPT 5.1). Structured input reduced safety-risk errors in both models, but residual errors remained, especially for MM and metastatic spinal tumor.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Both ChatGPT 5.1 and DeepSeek V3.2 demonstrated potential in text-based triage and differential diagnosis of MSDs for LBP, with structured clinical information generally improving performance, particularly for preliminary diagnosis accuracy and differential diagnosis agreement. However, their suboptimal sensitivity for red-flag conditions such as MM highlights significant safety concerns, indicating that they should not be used as stand-alone triage tools without clinician oversight. ChatGPT 5.1 showed stronger reasoning with structured inputs based on rationale ratings, whereas DeepSeek V3.2 showed better performance under chief-complaint-only input, with significantly higher Phase 1 preliminary diagnostic accuracy and numerically higher Phase 1 triage accuracy. 
These findings underscore the need for further model refinement, rigorous prospective validation, and integration with clinician oversight before clinical implementation.</p></sec></abstract><kwd-group><kwd>low back pain</kwd><kwd>musculoskeletal disorders</kwd><kwd>triage</kwd><kwd>differential diagnosis</kwd><kwd>large language model</kwd><kwd>ChatGPT</kwd><kwd>DeepSeek</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Musculoskeletal disorders (MSDs) represent a heterogeneous group of pathological conditions affecting the osseous, articular, and soft tissue structures [<xref ref-type="bibr" rid="ref1">1</xref>]. These disorders are clinically characterized by chronic pain, functional impairment, and progressive anatomical degeneration. Over decades, the global prevalence of MSDs has exhibited a sustained upward trajectory, paralleling demographic shifts toward an aging population [<xref ref-type="bibr" rid="ref2">2</xref>]. Epidemiological data reveal a 21.71% increase in the incidence of MSDs in the United States between 2000 and 2021 [<xref ref-type="bibr" rid="ref3">3</xref>], whereas the UK&#x2019;s National Health Service has allocated approximately &#x00A3;6.3 billion (US $7.78 billion; &#x00A3;1=US $1.2344 as of March 31, 2023) to MSD management from 2022 to 2023 [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref4">4</xref>], underscoring their substantial health and economic burden.</p><p>The clinical management of MSDs is further complicated by their overlapping symptomatology, multifactorial etiopathogenesis, and frequent comorbidities, all of which contribute to diagnostic ambiguity, which further promotes suboptimal resource use and compromised patient outcomes [<xref ref-type="bibr" rid="ref5">5</xref>]. 
Consequently, an effective preconsultation triage system that predicts disease likelihood and directs patients to the appropriate specialty is essential [<xref ref-type="bibr" rid="ref6">6</xref>] for reducing unnecessary visits and improving resource allocation and care efficiency [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref8">8</xref>]. The next step, namely establishing a definitive diagnosis, constitutes an even more complex and multifaceted process. It requires clinicians to draw upon their medical expertise and clinical acumen while concurrently integrating diverse factors and synthesizing a substantial volume of patient data under the overburdened health care systems, which presents a substantial challenge to outpatient services [<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>The development of new artificial intelligence (AI) systems, such as large language models (LLMs), has considerably improved the quality of automated analysis of large and complex data sets [<xref ref-type="bibr" rid="ref10">10</xref>]. LLMs are typically trained on vast open-source corpora spanning diverse domains, which enable them to generate human-like responses to user prompts with remarkable flexibility [<xref ref-type="bibr" rid="ref11">11</xref>]. Owing to their versatility, these chatbot systems have rapidly attracted attention in the medical field. Moreover, such systems are expected to shift the traditional approach to medical information retrieval from static, manual searches to a more dynamic, AI-assisted model of knowledge acquisition [<xref ref-type="bibr" rid="ref12">12</xref>]. However, LLMs also come with some drawbacks, such as misunderstanding of the prompt, lack of self-awareness, fabrication, falsification, or plagiarism [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. 
Previous studies have demonstrated the potential of LLMs to assist in tasks including medical licensing examinations, structured clinical reasoning, health information, and clinical vignettes [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. However, their application in real-world, open-ended clinical scenarios remains an emerging area of investigation. Recent research has predominantly focused on the diagnostic performance of LLMs for specific diseases [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>]. However, they fall short in addressing the complexity of MSDs. Patient-reported symptoms, such as low back pain (LBP) or leg pain, are often nonspecific and broad, frequently involving multiple specialties. The diagnostic capabilities of LLMs in these unstructured, real-world contexts require further systematic evaluation.</p><p>This study aimed to evaluate whether LLM-based chatbots can provide patients and outpatient physicians with comprehensible suggestions for more timely and accurate preliminary diagnosis and triage of MSDs, focusing on a common symptom, namely LBP. The diagnostic capabilities and limitations of 2 state-of-the-art AI chatbots, ChatGPT (5.1; OpenAI) and DeepSeek (V3.2; DeepSeek AI), were then assessed multidimensionally using standardized questionnaires derived from real outpatient records to highlight their potential utility in assisting with clinical diagnosis and triage.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Study Design</title><p>This retrospective comparative study was conducted at our center, a tertiary academic teaching hospital in Beijing, and enrolled outpatients presenting with LBP (<xref ref-type="fig" rid="figure1">Figure 1</xref>). The study assessed the performance of ChatGPT 5.1 and DeepSeek V3.2 in MSD triage and differential diagnosis through a multidimensional evaluation of their clinical reasoning. 
Differential diagnosis was defined as the formulation of potential conditions explaining a patient&#x2019;s symptoms based on information typically available at the initial consultation, including their medical history and physical examination findings. This study comprised 2 phases. Phase 1 (chief complaints phase) assessed the ability of LLMs to classify MSDs and propose diagnostic and differential diagnostic considerations from a brief clinical complaint. Phase 2 (structured questionnaire phase) incorporated structured clinical data, including general condition, symptom characteristics, and focused physical examination findings, into the LLMs via a structured questionnaire. In Phase 2, expert evaluators assessed the rationales of responses across 5 domains, namely relevance, understanding and reasoning, groundedness, trust and satisfaction, and harm. All case data, including clinical history, examination records, and radiology report descriptions, were derived from Chinese clinical records at our center and subsequently translated and standardized in English for LLM evaluation. The Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis&#x2013;LLM (TRIPOD-LLM) reporting guideline was followed to address the unique challenges of LLMs in biomedical and health care applications. A completed TRIPOD-LLM checklist is presented in <xref ref-type="supplementary-material" rid="app14">Checklist 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of the overall study design. 
AS: ankylosing spondylitis; IDS: infectious diseases of the spine; LDH: lumbar disc herniation; LSS: lumbar spinal stenosis; MSD: musculoskeletal disorder; MST: metastatic spinal tumor; OVCF: osteoporotic vertebral compression fracture.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e92315_fig01.png"/></fig></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study was conducted in accordance with the ethical principles stated in the Declaration of Helsinki and was approved by the institutional ethics committee of Beijing Chao-yang Hospital (approval number: 2025-KE-417). Because this was a retrospective secondary analysis of existing clinical records and involved minimal risk to participants, the requirement for additional informed consent was waived by the ethics committee. All patient data were anonymized and deidentified before analysis by removing personal identifiers and replacing them with unique study codes. The study data were used only for research purposes, and confidentiality was maintained throughout data extraction, model evaluation, and statistical analysis. No participant compensation was provided because this study did not involve prospective recruitment or direct participant contact. No identifiable participant information or identifiable images are included in the manuscript or supplementary materials.</p></sec><sec id="s2-3"><title>Population Selection</title><p>Patients who visited the orthopedic outpatient clinic between November 1, 2024, and December 31, 2024, were retrospectively analyzed. 
The inclusion criteria were as follows: (1) patients presenting with LBP as the primary symptom during their first visit; (2) availability of complete outpatient records, including symptom descriptions, physical examination findings, and general patient information; and (3) subsequent hospitalization or further outpatient investigations leading to a definitive diagnosis related to the chief complaint. A total of 455 medical records were initially screened. All patient data were anonymized by removing personal identifiers and replacing them with unique study codes. This process was independently conducted by a dedicated researcher who was not involved in the subsequent study. Two orthopedic surgeons (ZM and ML) with over 10 years of clinical experience independently reviewed each selected case using a standardized review protocol. For every case, each surgeon generated a ranked list of preliminary diagnoses and 3 plausible differential diagnoses (primary, secondary, and tertiary) and documented the key supporting clinical or imaging cues used for the judgment. In this study, the preliminary diagnosis was defined as the expert-adjudicated dominant cause of the index LBP presentation because this principal diagnosis most directly determines subsequent diagnostic workup and management. After independent annotation, the 2 lists were compared. Any discrepancy in the top diagnosis or in the composition of the 3-diagnosis set triggered an adjudication step, which involved joint reevaluation of a case in a structured consensus meeting, during which the rationale for each candidate diagnosis was explicitly discussed against the case information until agreement was reached. If consensus could not be achieved initially, a second round of review was performed after reassessment of the source records to ensure that the same information was available to both reviewers. The final decisions were regarded as the expert panel&#x2019;s opinions. 
Preadjudication agreement was quantified using Cohen kappa (&#x03BA;) for 3 endpoints: (1) MSD versus non-MSD identification (all 455 records), (2) diagnosis agreement, and (3) differential diagnosis agreement. For diagnosis-related endpoints, kappa was calculated among records for which both reviewers provided the corresponding labels; denominators and operational definitions are reported in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Based on the distribution of the MSDs at the orthopedic outpatient clinic of our hospital, as well as relevant clinical guidelines and literature on LBP [<xref ref-type="bibr" rid="ref22">22</xref>,<xref ref-type="bibr" rid="ref23">23</xref>], the following 6 major disease categories were identified: lumbar disc herniation (LDH), lumbar spinal stenosis (LSS), ankylosing spondylitis (AS), osteoporotic vertebral compression fracture (OVCF), infectious diseases of the spine (IDS), and metastatic spinal tumor (MST). After numbering the initially screened medical records, 20 cases were randomly selected from each disease category. Furthermore, 2 common non-MSD conditions, namely multiple myeloma (MM) and urinary system diseases (USD), were selected from the initial screenings based on clinical guidelines and literature. Detailed operational diagnostic criteria used for case inclusion and expert reference adjudication for each disease category are provided in <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>. 
A total of 20 medical records for each condition were randomly selected to form a non-MSD group, which was used to test the LLMs&#x2019; diagnostic performance for these disorders.</p></sec><sec id="s2-4"><title>LLMs and Prompt Design</title><p>Given their representative nature, advanced capabilities, mainstream adoption, and superior accessibility, 2 state-of-the-art LLMs, namely ChatGPT 5.1 and DeepSeek V3.2, were selected and accessed through their official websites. Advanced custom instructions or manual parameter adjustments were not used, ensuring the models were evaluated in their completely standard, default forms. The main features of these LLMs are detailed in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>. The performance of LLMs is highly contingent on prompt design, a factor that has given rise to the emerging field of &#x201C;prompt engineering,&#x201D; which provides evidence-based strategies to optimize model interactions. In accordance with these principles, we developed a standardized prompt protocol to establish a consistent question-answer framework, thereby enabling the models to perform as well as possible [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref26">26</xref>]. Each model was instructed to assume the role of an MSD specialist and draft responses that align with research evidence and clinical best practices. Prompt design was based on simulated clinical decision-making scenarios typical of MSD outpatient practice [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>]. 
In both phases, the prompt was structured as a 2-step workflow: the model was first required to decide whether the presentation was MSD in origin using a binary response (yes or no); if &#x201C;no,&#x201D; it had to recommend the most appropriate referral department, and only if &#x201C;yes&#x201D; did it proceed to provide the most likely diagnosis and 3 differential diagnoses. Tailored prompts were generated according to the type of input, such as chief complaints or structured questionnaire responses. To improve transparency and reproducibility, a representative example of the standardized prompt framework used in both phases is shown in <xref ref-type="fig" rid="figure2">Figure 2</xref>, whereas the full verbatim prompts and complete examples are provided in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>. All personally identifiable information was removed. The standardized Phase 2 questionnaire comprised 7 domains and 33 items, ensuring consistent case presentation and improving data reliability. It covered general information, a one-sentence chief complaint, detailed symptom characteristics, associated symptoms, focused orthopedic signs, past medical history, and personal and social profile. Physical examination&#x2013;related elements were limited to focused orthopedic findings documented in the outpatient record, such as tenderness or percussion pain, gait change, neurogenic claudication, straight-leg-raise response, and stiffness. The questionnaire content was adapted from established LBP guidelines and finalized through consensus among 3 experienced clinicians [<xref ref-type="bibr" rid="ref22">22</xref>]; the full instrument is provided in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>. Clinical records at our institution are routinely documented in Chinese. 
For the purpose of LLM evaluation, we extracted the required variables from the original Chinese records and compiled standardized case vignettes using a prespecified template. The vignettes were then translated into English by 2 bilingual spine surgeons (ML and AW) who independently performed forward translation. Discrepancies were resolved by consensus, and a third bilingual reviewer (JZ) conducted a final audit to ensure completeness, terminology consistency, and preservation of key clinical entities (symptoms, neurological findings, imaging descriptors, diagnoses, and red-flag features). To ensure compatibility with LLM processing, the questionnaire was generated using a structured prompt that captures the study objectives. All evaluations were conducted under a zero-shot setting, wherein no example questions or reference answers were provided. This design choice was intentional and reflects the clinical reality of initial outpatient triage. A zero-shot paradigm provides a stringent and unbiased assessment of the models&#x2019; intrinsic knowledge representation, eliminating the confounding influence of example selection bias inherent in few-shot prompting. Furthermore, it more faithfully replicates the unstructured, open-ended nature of real-world patient encounters compared to exemplar-driven benchmarks. This approach yields conservative estimates of model capability and improves the generalizability of findings. Prompt development and case sorting were undertaken by a single surgeon (RC, 3 years of experience) who completed a dedicated 3-day training program in December 2025. RC was responsible for preparing the prompts and organizing the case materials but did not participate in the subjective rationale-domain assessment. All prompts were executed on the same day (December 5, 2025) under identical conditions to minimize temporal variability. Each prompt was input into a new window in each LLM. 
Before evaluator scoring, all outputs underwent masking to reduce recognizable model-specific features. Identical prompts and response constraints were used for both models. Model-specific structural formatting (eg, excessive bolding, markdown-style headings, and distinctive line breaks) was removed, all outputs were converted into plain text, and generic filler phrases were manually deleted during data integration. The resulting responses were entered into a standardized evaluation grid. For the rationale-domain assessment, the 3 senior evaluators received only standardized plain-text outputs and were blinded to model identity, input phase, and case source. After masking and data integration, RC recorded the objective model-output results for triage accuracy, preliminary diagnosis accuracy, and differential diagnosis agreement according to the predefined scoring criteria. The Phase 2 explanatory rationales were independently assessed by 3 senior orthopedic surgeons: evaluator 1 (LZ), evaluator 2 (NF), and evaluator 3 (SY), who had 31, 25, and 20 years of surgical experience, respectively. These evaluators were responsible only for the blinded rationale-domain assessment and were blinded to model identity, input phase, and case source.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Representative examples of the standardized prompts used in Phase 1 and Phase 2. LLM: large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e92315_fig02.png"/></fig></sec><sec id="s2-5"><title>Assessment of LLM Responses</title><p>The 2 models were separately evaluated for their ability to (1) identify MSDs, (2) provide a preliminary diagnosis, and (3) propose differential diagnoses across 2 phases (Phases 1 and 2). 
Scoring was applied across 3 domains: (1) triage accuracy (1 point for correct classification; for non-MSD cases, correctness required both non-MSD identification and referral to the appropriate department; otherwise, 0), (2) preliminary diagnosis accuracy (correct=1 and incorrect=0), and (3) differential diagnosis agreement (0&#x2010;3 points based on the number of proposed differentials matching the expert panel). Triage accuracy was defined as correct classification of the index presentation as musculoskeletal or nonmusculoskeletal; for nonmusculoskeletal cases, referral appropriateness was additionally assessed (urology for USD and hematology for MM). Preliminary diagnosis accuracy was defined as concordance between the model&#x2019;s diagnosis and the adjudicated index diagnosis. Differential diagnosis agreement was defined as the number of model-generated differential diagnoses (0&#x2010;3) matching the adjudicated expert differential list, without weighting the order of listing. For MSD cases that were not first classified as musculoskeletal by the model, differential diagnosis agreement was not scored because the model did not proceed through the predefined diagnostic pathway. Clinically equivalent synonymous expressions were accepted, whereas broader, incomplete, partial, or nonspecific labels were not credited. The same 3 evaluators (LZ, NF, and SY) conducted the Phase 2 multidimensional evaluation of the explanatory reasoning provided by the models for differential diagnoses. Cases triaged as non-MSD were excluded from this rationale-scoring evaluation because they did not generate differential diagnoses or corresponding differential-diagnosis rationales under the predefined prompt workflow. A 5-point Likert scale was applied to evaluate (1) relevance, (2) understanding and reasoning, (3) groundedness, (4) trust and satisfaction, and (5) harm. 
Each domain was rated using a 5-point Likert scale, with higher scores indicating better performance: 1=very poor (unacceptable or unsafe, major errors, or irrelevance), 2=poor (substantial deficiencies and limited usefulness), 3=fair (moderate quality, acceptable with notable limitations), 4=good (minor issues or clinically useful), and 5=excellent (highly accurate, clear, and trustworthy with no clinically meaningful errors). For the groundedness domain, raters assessed whether the rationale was supported by the case input or contained unsupported, contradictory, or factually incorrect clinical content. Representative examples are provided in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>, as this domain is particularly susceptible to subjective interpretation and therefore warrants more explicit case-based illustration to improve reproducibility and transparency. For the harm domain, higher scores indicated fewer potentially harmful recommendations and better safety; the domain-specific harm anchors are provided in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>. Interrater agreement among the 3 evaluators was assessed, with the mean score being used as the final value. Before conducting the evaluations, all evaluators were asked to thoroughly familiarize themselves with the evaluation checklist and standard recommendations and rationales. During the review procedure, the evaluators were blinded to the answer source. 
Because failure to recognize red-flag features may result in clinically important harm, including inappropriate reassurance, delayed referral, and delayed diagnostic workup, we conducted an additional safety analysis focusing on disease groups in the present cohort for which prior literature suggests that missed red-flag recognition carries particularly high clinical risk, specifically malignancy-, infection-, and fracture-related conditions [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. This supplementary safety analysis was based on prespecified red-flag conditions, informed by prior literature on LBP warning features for malignancy, infection, and fracture. Four disease categories in our cohort were included: MM, MST, IDS, and OVCF. A safety-risk error was defined as an incorrect triage or diagnostic output that could plausibly delay appropriate referral, further workup, or treatment. For this supplementary safety analysis, safety-risk classification was conducted independently by 2 evaluators (TW and YX) according to the predefined criteria. Any discordant judgments were adjudicated by a senior evaluator (SY) to ensure consistency. Safety-risk errors were counted separately by disease, model, and phase, and were displayed in 2 phase-specific bar charts.</p></sec><sec id="s2-6"><title>Statistical Analysis</title><p>All statistical analyses were conducted using SPSS software (version 31.0; IBM Corp), with 2-sided tests and the significance level set at <italic>P</italic>&#x003C;.05. Triage accuracy and preliminary diagnosis accuracy were presented as percentages and compared using the McNemar test. Given the equidistance of the 5-point Likert scale and differential diagnosis agreement, assessment data for the 5 domains of explanatory reasoning and differential diagnostic ability were presented as mean (SD). Ranked data were compared using the Mann-Whitney <italic>U</italic> test. 
For key pairwise comparisons, effect sizes with 95% CIs were reported in addition to <italic>P</italic> values. For binary endpoints (triage accuracy and preliminary diagnosis accuracy), paired effect size was expressed as risk difference (RD) in percentage points. The 95% CIs for paired RDs were calculated using the Newcombe hybrid-score method for paired proportions based on the full 2&#x00D7;2 paired classification table. Exact 2-sided McNemar <italic>P</italic> values were calculated where applicable. For differential diagnosis agreement, because some cases did not proceed to differential diagnosis after incorrect triage and the available sample sizes therefore varied across comparisons, effect size was expressed as Hedges <italic>g</italic>, with 95% CIs obtained by bootstrap resampling (4000 resamples); 2-sided <italic>P</italic> values were calculated using the Mann-Whitney <italic>U</italic> test. Exact <italic>P</italic> values are reported where possible, with <italic>P</italic>&#x003C;.001 shown when values fell below the reporting precision. Interrater agreement was assessed using the intraclass correlation coefficient test for absolute agreement, with values of &#x2265;0.90, &#x2265;0.75-&#x003C;0.90, &#x2265;0.50-&#x003C;0.75, and &#x003C;0.50 indicating excellent, good, moderate, and poor agreement, respectively.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>Retrospective review and screening identified a total of 160 cases presenting with predominant LBP. The cohort comprised patients from 8 diagnostic categories (20 cases each), with an overall sex distribution of 84 males and 76 females. Across categories, mean age ranged from 46.97 to 66.22 years, whereas mean BMI ranged from 20.15 to 24.95 kg/m&#x00B2; (<xref ref-type="supplementary-material" rid="app7">Multimedia Appendix 7</xref>). 
Preadjudication interrater agreement between the 2 surgeons was excellent for MSD identification (&#x03BA;=0.914, 95% CI 0.894&#x2010;0.925; n=455), good for preliminary diagnosis concordance (&#x03BA;=0.784, 95% CI 0.722&#x2010;0.813; n=423), and moderate for differential diagnosis concordance (&#x03BA;=0.607, 95% CI 0.543&#x2010;0.676; n=410; <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec><sec id="s3-2"><title>Triage Performance of LLMs</title><p>The chief complaint (Phase 1) and structured questionnaire (Phase 2) were entered into the LLMs to simulate initial presentation and subsequent detailed history-taking in outpatients with LBP. The primary triage comparisons were conducted at the overall level. In Phase 1, DeepSeek V3.2 and ChatGPT 5.1 achieved overall triage accuracy of 84.4% and 75.6%, respectively, which increased to 90.6% and 93.1% in Phase 2. Effect-size analysis showed a modest overall between-model difference in Phase 1 favoring DeepSeek V3.2 (RD &#x2212;8.8%, 95% CI &#x2212;16.9% to &#x2212;0.5%; <italic>P</italic>=.05), whereas the Phase 2 between-model difference was small and nonsignificant (RD 2.5%, 95% CI &#x2212;2.9% to 8.1%; <italic>P</italic>=.48). Within-model comparisons showed greater improvement from Phase 1 to Phase 2 for ChatGPT 5.1 (RD 17.5%, 95% CI 10.2%-24.9%; <italic>P</italic>=.001) than for DeepSeek V3.2 (RD 6.2%, 95% CI &#x2212;0.7% to 13.3%; <italic>P</italic>=.11).</p><p>At the disease level, the overall pattern was that both models performed well for several common MSDs, whereas performance was lower for diagnostically challenging non-MSD or red-flag presentations. In Phase 1, DeepSeek V3.2 achieved 100% triage accuracy for LDH, AS, and MST, with high accuracy for LSS (90%), OVCF (95%), IDS (75%), and USD (70%), but substantially lower accuracy for MM (45%). 
ChatGPT 5.1 showed a broadly similar pattern, but with lower Phase 1 accuracies for LSS (75%), OVCF (70%), IDS (55%), and USD (60%). In Phase 2, triage accuracy for MM improved to 55% for DeepSeek V3.2 and 60% for ChatGPT 5.1, but remained only moderate. By contrast, triage accuracy for MST decreased relative to Phase 1 (80% for DeepSeek V3.2 and 85% for ChatGPT 5.1), suggesting that additional but still incomplete clinical information may not uniformly improve discrimination for all high-risk conditions. For the remaining disease categories, triage performance in Phase 2 was generally high. DeepSeek V3.2 reached 100% accuracy for LSS, LDH, AS, OVCF, and USD, whereas ChatGPT 5.1 reached 100% accuracy for LSS, LDH, AS, OVCF, IDS, and USD. In contrast, performance remained lower for MM in both models and decreased relative to Phase 1 for MST, indicating that structured questionnaire input improved triage for most common or clinically typical presentations but did not uniformly resolve challenges in red-flag or diagnostically complex conditions. These disease-specific patterns are visually summarized in <xref ref-type="fig" rid="figure3">Figure 3</xref>A for Phase 1 and <xref ref-type="fig" rid="figure3">Figure 3</xref>B for Phase 2, which shows that most categories clustered at high triage accuracy in Phase 2, whereas MM and MST remained the main exceptions. Detailed disease-level paired comparisons are provided in <xref ref-type="supplementary-material" rid="app8">Multimedia Appendix 8</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Triage accuracy of DeepSeek V3.2 and ChatGPT 5.1 across 8 etiologies of low back pain. Musculoskeletal versus nonmusculoskeletal identification across etiologies under (A) Phase 1 input (chief complaint only) and (B) Phase 2 input (structured questionnaire). 
AS: ankylosing spondylitis; IDS: infectious diseases of the spine; LDH: lumbar disc herniation; LSS: lumbar spinal stenosis; MM: multiple myeloma; MST: metastatic spinal tumor; OVCF: osteoporotic vertebral compression fracture; USD: urinary system diseases.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e92315_fig03.png"/></fig></sec><sec id="s3-3"><title>Preliminary Diagnosis Accuracy of LLMs</title><p>Preliminary diagnosis performance for specific MSDs was further evaluated. The primary diagnostic comparisons were prespecified at the overall level. In Phase 1, overall preliminary diagnostic accuracy was limited, at 48.3% for DeepSeek V3.2 and 35.0% for ChatGPT 5.1. In Phase 2, these values increased to 76.7% and 87.5%, respectively. Effect-size analysis confirmed a significant overall advantage of DeepSeek V3.2 over ChatGPT 5.1 in Phase 1 (RD &#x2212;13.3%, 95% CI &#x2212;22.5% to &#x2212;3.8%; <italic>P</italic>=.01), whereas ChatGPT 5.1 showed a modest but significant advantage in Phase 2 (RD 10.8%, 95% CI 2.5%-19.2%; <italic>P</italic>=.02). Within-model comparisons further showed marked improvements from Phase 1 to Phase 2 for both DeepSeek V3.2 (RD 28.3%, 95% CI 16.8%-38.8%; <italic>P</italic>=.001) and ChatGPT 5.1 (RD 52.5%, 95% CI 42.8%-60.6%; <italic>P</italic>&#x003C;.001).</p><p>At the disease level, both models showed heterogeneous performance across conditions, with the greatest gains generally seen in diseases requiring more structured clinical context for discrimination. In Phase 1, DeepSeek V3.2 yielded higher preliminary diagnostic accuracy than ChatGPT 5.1 for LSS (40% vs 25%), LDH (85% vs 80%), OVCF (60% vs 10%), IDS (45% vs 30%), and MST (25% vs 15%); among these, the difference for OVCF was statistically significant. In Phase 2, preliminary diagnostic accuracy improved in nearly all disease categories for both models. 
For LDH, both models reached 100% accuracy after structured questionnaire input. ChatGPT 5.1 showed numerically higher Phase 2 accuracy than DeepSeek V3.2 for LSS (80% vs 75%), AS (85% vs 80%), OVCF (100% vs 85%), IDS (85% vs 65%), and MST (75% vs 55%), although the between-model differences for these individual diseases were not statistically significant. Notably, OVCF represented one of the largest phase-dependent changes, particularly for ChatGPT 5.1, which increased from 10% in Phase 1 to 100% in Phase 2. Similarly, MST and IDS showed clear improvement after structured input, but preliminary diagnostic accuracy remained imperfect, indicating persistent difficulty with some high-risk or information-dependent conditions. Detailed disease-level comparisons are shown separately for Phase 1 in <xref ref-type="fig" rid="figure4">Figure 4</xref>A and Phase 2 in <xref ref-type="fig" rid="figure4">Figure 4</xref>B, with additional results provided in <xref ref-type="supplementary-material" rid="app9">Multimedia Appendix 9</xref>.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Preliminary diagnostic accuracy of DeepSeek V3.2 and ChatGPT 5.1 for the 6 musculoskeletal etiologies of low back pain under (A) Phase 1 (chief complaint) and (B) Phase 2 (structured questionnaire). AS: ankylosing spondylitis; IDS: infectious diseases of the spine; LDH: lumbar disc herniation; LSS: lumbar spinal stenosis; MST: metastatic spinal tumor; OVCF: osteoporotic vertebral compression fracture.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e92315_fig04.png"/></fig></sec><sec id="s3-4"><title>Differential Diagnosis Performance of LLMs</title><p>The ability of the LLMs to generate correct differential diagnoses for MSDs was also evaluated. Because some cases did not proceed to differential diagnosis after incorrect triage, available sample sizes varied across comparisons. 
The primary comparisons were therefore summarized at the overall level. In Phase 1, differential diagnosis agreement did not differ significantly between the models (DeepSeek V3.2: mean 1.27, SD 0.71; ChatGPT 5.1: mean 1.34, SD 0.70; Hedges <italic>g</italic>=0.10; <italic>P</italic>=.48). In Phase 2, both models improved significantly (DeepSeek V3.2: mean 2.02, SD 0.74; ChatGPT 5.1: mean 2.03, SD 0.77), with large within-model effect sizes (overall Hedges <italic>g</italic>=1.03 for DeepSeek V3.2 and 0.93 for ChatGPT 5.1; both <italic>P</italic>&#x003C;.001). By contrast, the overall between-model differences remained small and nonsignificant in both phases (Phase 1: Hedges <italic>g</italic>=0.10; <italic>P</italic>=.48; Phase 2: Hedges <italic>g</italic>=0.01; <italic>P</italic>=.80).</p><p>At the disease level, the pattern of differential diagnosis agreement was more variable. In Phase 1, LDH yielded the highest scores in both models (DeepSeek V3.2: mean 1.80, SD 0.60; ChatGPT 5.1: mean 1.90, SD 0.62), whereas IDS had the lowest scores (DeepSeek V3.2: mean 0.80, SD 0.54; ChatGPT 5.1: mean 0.91, SD 0.79), indicating that infectious presentations were especially difficult when only chief-complaint information was available. In Phase 2, scores increased across most conditions for both models, but gains were limited for MST, which remained the lowest-scoring disease in both models (DeepSeek V3.2: mean 1.19, SD 0.39; ChatGPT 5.1: mean 1.18, SD 0.86). The clearest between-model disease-level difference was observed for LSS, for which ChatGPT 5.1 scored significantly higher than DeepSeek V3.2 in both Phase 1 and Phase 2. By contrast, most other disease-level between-model comparisons were not statistically significant. 
These results suggest that structured clinical input substantially improved differential diagnosis performance overall, but that the degree of improvement still varied by disease category, with limited gains in certain complex red-flag conditions such as MST. Full disease-level comparisons are shown in <xref ref-type="fig" rid="figure5">Figure 5</xref>A for disease-level agreement in Phase 1, <xref ref-type="fig" rid="figure5">Figure 5</xref>B for disease-level agreement in Phase 2, and <xref ref-type="fig" rid="figure5">Figure 5</xref>C for overall agreement by phase, with additional results provided in <xref ref-type="supplementary-material" rid="app10">Multimedia Appendix 10</xref>.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Differential diagnosis agreement of DeepSeek V3.2 and ChatGPT 5.1 with the expert reference standard. Results are shown for (A) disease-level agreement in Phase 1 (chief complaint), (B) disease-level agreement in Phase 2 (structured questionnaire), and (C) overall agreement by phase. AS: ankylosing spondylitis; IDS: infectious diseases of the spine; LDH: lumbar disc herniation; LSS: lumbar spinal stenosis; MST: metastatic spinal tumor; OVCF: osteoporotic vertebral compression fracture.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e92315_fig05.png"/></fig></sec><sec id="s3-5"><title>Reasoning and Explanatory Evaluation of LLMs</title><p>The overall interrater agreement among the 3 evaluators scoring the LLMs&#x2019; rationales ranged from good to excellent. DeepSeek V3.2 showed intraclass correlation coefficients of 0.773 (relevance), 0.852 (understanding and reasoning), 0.781 (groundedness), 0.875 (harm), and 0.943 (trust and satisfaction). 
Corresponding intraclass correlation coefficients for ChatGPT 5.1 were 0.924, 0.789, 0.758, 0.901, and 0.952 (95% CIs are reported in <xref ref-type="supplementary-material" rid="app11">Multimedia Appendix 11</xref>). Across domains, the mean scores indicated good performance for both models (range 3.88&#x2010;4.88). DeepSeek V3.2 achieved mean scores of 4.08 (SD 0.58; relevance), 4.14 (SD 0.59; understanding and reasoning), 4.88 (SD 0.20; groundedness), 4.18 (SD 0.46; harm), and 3.96 (SD 0.80; trust and satisfaction), whereas ChatGPT 5.1 obtained mean scores of 4.04 (SD 0.79), 4.55 (SD 0.56), 4.87 (SD 0.30), 4.21 (SD 0.44), and 3.88 (SD 0.82), respectively. ChatGPT 5.1 achieved significantly higher scores in the understanding and reasoning domain than did DeepSeek V3.2 (<italic>P</italic>=.01). A visual summary is provided in <xref ref-type="fig" rid="figure6">Figure 6</xref>, with full details presented in <xref ref-type="supplementary-material" rid="app12">Multimedia Appendix 12</xref>.</p><fig position="float" id="figure6"><label>Figure 6.</label><caption><p>Reasoning and explanatory evaluation of model outputs for DeepSeek V3.2 and ChatGPT 5.1. Three blinded senior orthopedic evaluators independently rated model rationales using a 5-point Likert scale across 5 predefined domains. Asterisk (*) denotes <italic>P</italic>&#x003C;.05.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e92315_fig06.png"/></fig></sec><sec id="s3-6"><title>Safety-Focused Analysis of LLMs</title><p>To complement the main performance analysis, we conducted a supplementary safety-focused analysis of 4 prespecified red-flag conditions: MM, MST, IDS, and OVCF. Overall, the total number of safety-risk cases was 38 for DeepSeek V3.2 and 48 for ChatGPT 5.1 in Phase 1, decreasing to 28 and 16, respectively, in Phase 2. 
Thus, structured questionnaire input reduced the overall safety-risk burden by 10 cases (26.3%) for DeepSeek V3.2 and by 32 cases (66.7%) for ChatGPT 5.1. At the disease level, MST had the highest overall safety-risk burden across models and phases (45 cases in total), followed by MM (37 cases), IDS (27 cases), and OVCF (21 cases). In Phase 1, MST accounted for the largest number of safety-risk cases in both models (14 cases for DeepSeek V3.2 and 16 cases for ChatGPT 5.1). In Phase 2, the highest safety-risk burden remained in MST and MM for DeepSeek V3.2 (9 cases each), whereas MM showed the highest residual burden for ChatGPT 5.1 (8 cases). For DeepSeek V3.2, the number of safety-risk cases decreased from 7 to 2 for OVCF, from 14 to 9 for MST, and from 11 to 9 for MM, but increased from 6 to 8 for IDS. For ChatGPT 5.1, safety-risk cases decreased across all 4 red-flag conditions, from 12 to 0 for OVCF, 11 to 2 for IDS, 16 to 6 for MST, and 9 to 8 for MM. Notably, although Phase 2 reduced the overall number of safety-risk cases for both models, clinically important residual errors remained, particularly for MM and MST. These findings indicate that structured clinical information improved safety-related performance, but did not fully eliminate clinically dangerous errors in red-flag conditions. Disease-specific counts are shown in <xref ref-type="supplementary-material" rid="app13">Multimedia Appendix 13</xref>.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study systematically evaluated the triage and diagnostic capabilities of DeepSeek V3.2 and ChatGPT 5.1 for patients with LBP based on real-world clinical data in a simulated outpatient setting. The results demonstrated that even based solely on the chief complaint, both models exhibited acceptable ability for disease recognition. 
Structured questionnaire input generally enhanced model performance, particularly for preliminary diagnostic accuracy and differential diagnosis agreement, whereas its impact on triage accuracy was model-dependent and reached statistical significance only for ChatGPT 5.1. From a practical workflow perspective, LLMs may be most useful as front-end support in LBP clinics: (1) transforming unstructured chief complaints into structured history templates; (2) prompting red-flag screening and recommending appropriate next-step tests; and (3) suggesting referral departments when non-MSD etiologies are suspected. Importantly, the consistent Phase 1 to Phase 2 gain, designed to mirror first-visit LBP encounters where decisions often start from information-sparse complaints, highlights that LLM performance is strongly information-dependent and can be materially improved by structured intake, which is a defining feature of our study.</p><p>Previous studies have explored the performance of LLMs in the diagnosis of orthopedic diseases using various information formats, such as chief complaints, structured questionnaires, and complete medical records [<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref33">33</xref>]. Kunze et al [<xref ref-type="bibr" rid="ref31">31</xref>] reported that ChatGPT 4 provided clinically reasonable differential diagnoses and triage recommendations based solely on the chief complaint of knee joint pain, achieving a diagnostic accuracy of 70%. Moreover, supplementation with additional information, such as age or medical history, increased the accuracy rate to 100%. Pagano et al [<xref ref-type="bibr" rid="ref32">32</xref>] demonstrated that LLMs could achieve a diagnostic sensitivity of 92.3% using self-reported data from structured questionnaires collected from patients with hip and knee osteoarthritis. 
Other studies have shown that when complete outpatient records were input, including symptoms, physical examination, radiological interpretation, and expert treatment recommendations, ChatGPT 4 achieved a completely accurate diagnosis [<xref ref-type="bibr" rid="ref33">33</xref>]. These studies collectively highlight the potential of LLMs as support tools for clinical triage and decision-making. Although our results did not surpass those reported previously, they still confirmed the promising application prospects of LLMs during the initial outpatient triage. The performance differences may be attributed to 2 factors. First, this study focused on the initial consultation scenario, with relatively limited input information, unlike the detailed medical records used in previous studies. Second, LBP has a more complex etiology, as well as more nonspecific symptoms, than do knee or hip joint diseases, making diagnosis more challenging. Despite the limited information, nonspecific symptoms, and multifactorial etiology, the LLMs were still able to maintain a certain level of diagnostic efficacy, suggesting their robustness and potential value in complex clinical settings.</p><p>However, several specific issues warrant further consideration. First, model performance varied substantially across disease categories. In general, both LLMs performed better for conditions with relatively typical clinical presentations and clearer diagnostic pathways, but struggled with diseases characterized by subtle manifestations or more complex differential diagnosis. MM is a representative example. Because MM may initially present as nonspecific LBP, it can be easily confused with common degenerative conditions. In routine practice, its diagnosis depends heavily on laboratory findings, radiological clues, and, in many cases, bone marrow aspiration or biopsy [<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>]. 
These key data are typically unavailable at the time of initial outpatient triage, which helps explain the persistently limited performance of both models and underscores the continued need for clinician oversight. Second, from the perspective of each phase, the performance of LLMs was highly dependent on the completeness of the information [<xref ref-type="bibr" rid="ref32">32</xref>]. When relying solely on the chief complaint, the models predominantly leaned toward common diseases, showing good recognition of typical degenerative conditions but lower accuracy for diseases that require specific tests. After structured questionnaire input was added, the models showed marked improvements in preliminary diagnosis accuracy and differential diagnosis agreement. Notably, the triage accuracy for MST was lower in Phase 2 than in Phase 1, suggesting that more detailed information might introduce interference, testing the model&#x2019;s ability to extract key features. This finding is clinically plausible given that the presenting complaints and &#x201C;red-flag&#x201D; symptom patterns of MST can overlap substantially with those of hematologic malignancies, such as MM (eg, persistent back pain, constitutional symptoms, anemia-related fatigue, and nonspecific neurologic complaints), which complicates discrimination based on history alone. Furthermore, the most discriminative diagnostic cues for MST are often not purely symptom-based but rather depend on objective evidence, including characteristic imaging findings (eg, destructive lesions or epidural involvement), laboratory markers, and confirmatory tests (eg, advanced imaging, tumor markers, or biopsy). Therefore, when richer but still incomplete clinical narratives are provided, such as in Phase 2, the model may overweight nonspecific features and be &#x201C;distracted&#x201D; toward competing malignant etiologies (particularly MM), leading to mistriage. 
This pattern reflects an important clinical reality: the information available at the initial outpatient encounter is often incomplete. In this study, the LLMs were intentionally evaluated under such information-limited conditions to determine whether they could provide reasonable early triage and differential diagnostic support for patients presenting with suspected MSD-related LBP. By contrast, the expert reference standard was established using more complete clinical information to ensure diagnostic consistency and a stable benchmark for comparison. Although this design was necessary for evaluation, it also means that strict comparability between expert adjudication and model output is inherently limited, especially for red-flag conditions that often require imaging, laboratory testing, or subsequent inpatient workup for confirmation. Accordingly, future work should incorporate structured red-flag fields and high-yield objective data (key laboratory indices and standardized imaging descriptors or direct image inputs where appropriate) and evaluate multimodal or rule-constrained prompting strategies to improve the diagnostic performance of LLMs under information-dense scenarios. Third, safety deserves specific emphasis. Our supplementary safety analysis showed that structured questionnaire input reduced the number of safety-risk errors in both models; however, clinically important residual errors persisted, particularly for MM and MST. This finding indicates that gains in overall triage or diagnostic accuracy do not necessarily translate into adequate safety for high-risk presentations. In practice, such diagnostic failures may lead to false patient reassurance and critical delays in referrals or workups. Consequently, our findings support using LLMs solely as adjunctive tools for preliminary triage rather than as autonomous systems for evaluating potential &#x201C;red-flag&#x201D; conditions. 
A fundamental challenge remains that high-risk MSDs often cannot be reliably distinguished from common degenerative conditions using text alone, especially when symptoms are nonspecific. Furthermore, LLMs may default to common musculoskeletal explanations when information is incomplete, increasing the risk of missing rare but dangerous conditions. Future research should therefore prioritize reducing high-consequence errors through structured red-flag screening, explicit escalation protocols, and the integration of multimodal data within real-world clinical workflows. Fourth, the 2 models appeared to show somewhat different strengths. DeepSeek V3.2 performed better when only chief-complaint information was available, whereas ChatGPT 5.1 demonstrated stronger reasoning and diagnostic performance after structured input was added. This finding suggests that, with further refinement and validation, future clinical decision support systems may potentially be designed to dynamically select or combine models based on task characteristics, building a collaborative framework that leverages complementary strengths. Nevertheless, such an approach remains hypothetical and would require substantial technical, regulatory, and workflow development before practical implementation. Fifth, the models may have used demographic heuristics, particularly age and sex, to support some diagnostic judgments. This should be acknowledged when interpreting model performance. In this cohort, certain disease groups showed relatively distinct demographic clustering; for example, AS tended to occur in younger patients, whereas OVCF was more common in older patients. Such cues are also a legitimate part of real-world clinical reasoning rather than an invalid shortcut. Many clinically relevant distinctions in LBP require the integration of symptom profiles, associated features, focused physical findings, and contextual history. 
For example, differentiating common degenerative disease from spinal infection, MST, or other red-flag conditions cannot be reliably accomplished on the basis of age or sex alone. The marked performance gains observed after structured questionnaire input in Phase 2 therefore suggest that the models were not relying exclusively on simple demographic associations, but were also synthesizing richer clinical information when it was made available. Future studies should therefore incorporate feature ablation, demographic masking, or counterfactual case perturbation designs to better distinguish statistical heuristic use from more robust clinical reasoning.</p><p>In addition, model performance should not be judged solely by endpoint accuracy [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. Our multidimensional assessment of the explanatory rationales showed that both models performed well overall in relevance, understanding and reasoning, groundedness, harm, and trust and satisfaction, with groundedness approaching the maximum score. This finding indicates that their outputs were highly reliable under structured inputs. Notably, ChatGPT 5.1 significantly outperformed DeepSeek V3.2 in understanding and reasoning (<italic>P</italic>&#x003C;.05), reflecting stronger capabilities in integrating clinical information and logical inference, which is consistent with its higher diagnostic accuracy in the structured questionnaire phase. 
However, both models achieved relatively lower scores in trust and satisfaction than in the other domains, suggesting that clinicians remain cautious about AI-assisted decision-making and that further efforts are needed to enhance their clinical credibility and practical utility.</p><p>Overall, our findings support the potential of LLMs as clinician-assisting tools in LBP triage, while also underscoring the considerable practical and governance barriers that remain for safe and responsible real-world implementation. First, the diagnosis of LBP-related diseases heavily relies on the integrated use of physical examination and radiological evaluation [<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref39">39</xref>]. However, most current LLMs are limited to pure text-based interactions, constraining their potential for direct application in real clinical settings. Future developments in LLMs should transcend text-only models to support multimodal inputs, integrating symptoms, signs, and imaging data, while being embedded within clinical electronic medical record systems to serve as real-time decision support tools for clinicians [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. Such progress could facilitate human-AI collaboration in triage and differential diagnosis, potentially improving early detection and triage efficiency, especially in primary care and resource-limited settings. Second, it is worth emphasizing that the value of AI in clinical practice lies not in replacing clinicians, but in collaborating with them as a &#x201C;copilot.&#x201D; In this human-in-the-loop workflow, the core role of AI is to provide differential diagnoses, identify potentially complex cases, and assist in information integration, thereby expanding clinicians&#x2019; cognitive boundaries and enhancing decision-making efficiency. 
However, all AI-generated outputs still require final interpretation and judgment by clinicians in light of the patient&#x2019;s specific clinical context and the clinician&#x2019;s own expertise. At the same time, real-world clinical deployment of LLMs raises important practical and medico-legal concerns that extend beyond diagnostic performance alone. Even when used as decision-support tools, LLM outputs may introduce automation bias, inappropriate overreliance, or delayed escalation if plausible but incorrect recommendations are accepted without sufficient verification [<xref ref-type="bibr" rid="ref41">41</xref>]. In addition, accountability remains insufficiently defined when patient harm occurs in AI-assisted care, because responsibility may be distributed across clinicians, institutions, developers, and platform providers, whereas current legal and regulatory frameworks are still adapting to generative AI in medicine [<xref ref-type="bibr" rid="ref42">42</xref>,<xref ref-type="bibr" rid="ref43">43</xref>]. Clinical implementation also requires more than accuracy alone; it depends on traceability, transparent documentation of model use, data governance, privacy protection, and clearly defined escalation pathways for unsafe or uncertain outputs [<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. Accordingly, before LLMs can be integrated into routine musculoskeletal triage workflows, future research should move beyond retrospective performance studies and include prospective implementation studies that evaluate safety monitoring, human oversight, workflow integration, and accountability structures under real clinical conditions [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref45">45</xref>-<xref ref-type="bibr" rid="ref48">48</xref>]. Further work should also explore how human-AI collaboration affects diagnostic quality, efficiency, and clinicians&#x2019; trust in AI in real-world practice. 
Finally, it is also important to note that disease prevalence may materially influence LLM evaluation. In LBP populations, prevalence affects the clinical interpretability of aggregate performance and may change the apparent value of a model, particularly when rare but high-risk conditions are oversampled. In this study, a balanced dataset was used to support fairer disease-level comparison and to focus on clinically important red-flag conditions. However, this design does not reflect the true prevalence structure of routine outpatient practice. Future studies should therefore prospectively validate these models in prevalence-representative LBP outpatient cohorts and further examine the real-world impact of AI assistance on diagnostic quality, efficiency, and clinicians&#x2019; trust in AI.</p></sec><sec id="s4-2"><title>Limitations</title><p>This study still has several limitations. First, both the model inputs and the reference standard were based on retrospective documentation rather than real-time doctor-patient interaction. Although the structured questionnaire was derived from actual medical records, it could not fully reproduce the ambiguity, incompleteness, and contextual complexity of real outpatient communication [<xref ref-type="bibr" rid="ref49">49</xref>]. In addition, the expert benchmark was based on a single preliminary diagnosis, which provided a stable reference for evaluation but may not fully capture multimorbidity or presentations in which multiple concurrent conditions contribute to symptoms. Second, the single-center, retrospective design, limited sample size, and restricted disease spectrum constrain generalizability. Moreover, although the balanced case mix allowed fairer disease-level comparison, it does not reflect the true prevalence structure of LBP in routine practice and may therefore introduce spectrum bias. 
The current findings should be interpreted as a controlled comparative evaluation under balanced conditions rather than as a direct estimate of real-world triage or diagnostic performance. In addition, our non-MSD triage scoring used a rigid prespecified referral mapping, requiring hematology for MM and urology for USD. Broader but potentially clinically reasonable referrals, such as internal medicine or oncology for suspected MM, were not credited, which may have modestly underestimated triage accuracy for non-MSD conditions. Third, model performance may not have been fully optimized [<xref ref-type="bibr" rid="ref24">24</xref>-<xref ref-type="bibr" rid="ref26">26</xref>]. Prompt design was informed by relevant guidance, but alternative prompting strategies (eg, chain-of-thought) were not systematically compared, and the specialist role framing may have influenced model priors, particularly under information-sparse conditions. In addition, although structured masking and a clinician-reviewed forward-translation workflow were applied, formal back-translation was not performed. Therefore, subtle linguistic shifts and residual stylistic cues may still have influenced model outputs and partially compromised evaluator blinding. Fourth, we did not assess computational efficiency, latency, token usage, or cost, and the rapid evolution of LLMs raises the possibility of model drift, limiting the long-term stability of these findings. Furthermore, this study design did not include ablation experiments to isolate the exact contribution of demographic cues from deeper causal reasoning. Multiple subgroup and disease-level comparisons were performed without formal adjustment for multiplicity, so nominally significant findings should be interpreted cautiously, with greater emphasis on effect sizes, 95% CI, and overall patterns of results. 
Regarding the prompting paradigm, diagnostic accuracy may be higher in deployed settings where iterative questioning or few-shot exemplars are available, particularly for red-flag conditions; however, these approaches may also introduce additional challenges, including anchoring bias and the need for predefined stopping criteria. Finally, ethical and governance issues surrounding patient data use and clinical deployment also remain important barriers to near-term implementation.</p></sec><sec id="s4-3"><title>Conclusions</title><p>Both ChatGPT 5.1 and DeepSeek V3.2 demonstrated potential in text-based triage and differential diagnosis of MSDs for LBP, with structured clinical information generally improving performance, particularly for preliminary diagnosis accuracy and differential diagnosis agreement. However, their limited sensitivity for red-flag conditions such as MM highlights significant safety risks, cautioning against their use as independent triage tools. ChatGPT 5.1 showed stronger reasoning with structured inputs based on rationale ratings, whereas DeepSeek V3.2 showed better performance under chief-complaint-only input, with significantly higher Phase 1 preliminary diagnostic accuracy and numerically higher Phase 1 triage accuracy. These findings underscore the need for further model refinement, rigorous prospective validation, and integration with clinician oversight before any clinical application.</p></sec></sec></body><back><ack><p>The authors disclose that ChatGPT 5.1 (OpenAI) was used only for language polishing (improving grammar, clarity, and concision) during manuscript preparation. All authors reviewed and edited all artificial intelligence&#x2013;assisted text and take full responsibility for the final content. No generative AI was used for data collection, data analysis, visualization, or the generation of study data, analyses, or results. 
No identifiable participant data or raw transcripts were entered into the tool.</p></ack><notes><sec><title>Funding</title><p>The authors declared no financial support was received for this work.</p></sec><sec><title>Data Availability</title><p>The datasets generated during and/or analyzed during this study are available from the corresponding author on reasonable request.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: ZM, RC, TW, and LZ</p><p>Data curation: RC, ZM, ML, AW, and JZ</p><p>Formal analysis: RC, ZM, and YX</p><p>Investigation: ZM, RC, AW, ML, YX, and JZ</p><p>Methodology: ZM, RC, TW, and LZ</p><p>Project administration: TW and LZ</p><p>Resources: LZ, TW, NF, and SY</p><p>Supervision: TW, LZ, NF, and SY</p><p>Validation: ZM, ML, LZ, NF, SY, TW, and YX</p><p>Visualization: ZM, RC, and AW</p><p>Writing &#x2013; original draft: ZM and RC</p><p>Writing &#x2013; review &#x0026; editing: ZM, RC, AW, YX, ML, SY, NF, JZ, TW, and LZ</p><p>TW is the co-corresponding author for this work and can be reached via email at 921158355@qq.com.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">AS</term><def><p>ankylosing spondylitis</p></def></def-item><def-item><term id="abb3">IDS</term><def><p>infectious diseases of the spine</p></def></def-item><def-item><term id="abb4">LBP</term><def><p>low back pain</p></def></def-item><def-item><term id="abb5">LDH</term><def><p>lumbar disc herniation</p></def></def-item><def-item><term id="abb6">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb8">LSS</term><def><p>lumbar spinal stenosis</p></def></def-item><def-item><term id="abb9">MM</term><def><p>multiple myeloma</p></def></def-item><def-item><term 
id="abb10">MSD</term><def><p>musculoskeletal disorder</p></def></def-item><def-item><term id="abb11">MST</term><def><p>metastatic spinal tumor</p></def></def-item><def-item><term id="abb12">OVCF</term><def><p>osteoporotic vertebral compression fracture</p></def></def-item><def-item><term id="abb13">RD</term><def><p>risk difference</p></def></def-item><def-item><term id="abb14">TRIPOD-LLM</term><def><p>Transparent Reporting of a multivariable prediction model for Individual Prognosis Or Diagnosis&#x2013;Large Language Model</p></def></def-item><def-item><term id="abb15">USD</term><def><p>urinary system disease</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Williams</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kamper</surname><given-names>SJ</given-names> </name><name name-style="western"><surname>Wiggers</surname><given-names>JH</given-names> </name><etal/></person-group><article-title>Musculoskeletal conditions may increase the risk of chronic disease: a systematic review and meta-analysis of cohort studies</article-title><source>BMC Med</source><year>2018</year><month>09</month><day>25</day><volume>16</volume><issue>1</issue><fpage>167</fpage><pub-id pub-id-type="doi">10.1186/s12916-018-1151-2</pub-id><pub-id pub-id-type="medline">30249247</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vollset</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Goren</surname><given-names>E</given-names> </name><name name-style="western"><surname>Yuan</surname><given-names>CW</given-names> </name><etal/></person-group><article-title>Fertility, mortality, migration, and population scenarios for 195 countries and 
territories from 2017 to 2100: a forecasting analysis for the Global Burden of Disease Study</article-title><source>Lancet</source><year>2020</year><month>10</month><day>17</day><volume>396</volume><issue>10258</issue><fpage>1285</fpage><lpage>1306</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(20)30677-2</pub-id><pub-id pub-id-type="medline">32679112</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nguyen</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>P</given-names> </name><name name-style="western"><surname>Rodriguez</surname><given-names>EK</given-names> </name><name name-style="western"><surname>Chahal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Freedman</surname><given-names>BR</given-names> </name><name name-style="western"><surname>Nazarian</surname><given-names>A</given-names> </name></person-group><article-title>Addressing the growing burden of musculoskeletal diseases in the ageing US population: challenges and innovations</article-title><source>Lancet Healthy Longev</source><year>2025</year><month>05</month><volume>6</volume><issue>5</issue><fpage>100707</fpage><pub-id pub-id-type="doi">10.1016/j.lanhl.2025.100707</pub-id><pub-id pub-id-type="medline">40381641</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nguyen</surname><given-names>AT</given-names> </name><name name-style="western"><surname>Aris</surname><given-names>IM</given-names> </name><name name-style="western"><surname>Snyder</surname><given-names>BD</given-names> </name><etal/></person-group><article-title>Musculoskeletal health: an ecological study assessing disease burden and research funding</article-title><source>Lancet Reg Health 
Am</source><year>2024</year><month>01</month><volume>29</volume><fpage>100661</fpage><pub-id pub-id-type="doi">10.1016/j.lana.2023.100661</pub-id><pub-id pub-id-type="medline">38225979</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>I</given-names> </name><name name-style="western"><surname>Wiles</surname><given-names>L</given-names> </name><name name-style="western"><surname>Waller</surname><given-names>R</given-names> </name><etal/></person-group><article-title>What does best practice care for musculoskeletal pain look like? Eleven consistent recommendations from high-quality clinical practice guidelines: systematic review</article-title><source>Br J Sports Med</source><year>2020</year><month>01</month><volume>54</volume><issue>2</issue><fpage>79</fpage><lpage>86</lpage><pub-id pub-id-type="doi">10.1136/bjsports-2018-099878</pub-id><pub-id pub-id-type="medline">30826805</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lowe</surname><given-names>C</given-names> </name><name name-style="western"><surname>Atherton</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lloyd</surname><given-names>P</given-names> </name><name name-style="western"><surname>Waters</surname><given-names>A</given-names> </name><name name-style="western"><surname>Morrissey</surname><given-names>D</given-names> </name></person-group><article-title>Improving safety, efficiency, cost, and satisfaction across a musculoskeletal pathway using the digital assessment routing tool for triage: quality improvement study</article-title><source>J Med Internet Res</source><year>2025</year><month>04</month><day>25</day><volume>27</volume><fpage>e67269</fpage><pub-id 
pub-id-type="doi">10.2196/67269</pub-id><pub-id pub-id-type="medline">40279646</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Joseph</surname><given-names>C</given-names> </name><name name-style="western"><surname>Morrissey</surname><given-names>D</given-names> </name><name name-style="western"><surname>Abdur-Rahman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Hussenbux</surname><given-names>A</given-names> </name><name name-style="western"><surname>Barton</surname><given-names>C</given-names> </name></person-group><article-title>Musculoskeletal triage: a mixed methods study, integrating systematic review with expert and patient perspectives</article-title><source>Physiotherapy</source><year>2014</year><month>12</month><volume>100</volume><issue>4</issue><fpage>277</fpage><lpage>289</lpage><pub-id pub-id-type="doi">10.1016/j.physio.2014.03.007</pub-id><pub-id pub-id-type="medline">25242531</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zuo</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Evolution of the &#x201C;Internet Plus Health Care&#x201D; mode enabled by artificial intelligence: development and application of an outpatient triage system</article-title><source>J Med Internet Res</source><year>2024</year><month>10</month><day>30</day><volume>26</volume><fpage>e51711</fpage><pub-id pub-id-type="doi">10.2196/51711</pub-id><pub-id pub-id-type="medline">39476375</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Gaber</surname><given-names>F</given-names> </name><name name-style="western"><surname>Shaik</surname><given-names>M</given-names> </name><name name-style="western"><surname>Allega</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Evaluating large language model workflows in clinical decision support for triage and referral and diagnosis</article-title><source>NPJ Digit Med</source><year>2025</year><month>05</month><day>9</day><volume>8</volume><issue>1</issue><fpage>263</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01684-1</pub-id><pub-id pub-id-type="medline">40346344</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Benary</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>XD</given-names> </name><name name-style="western"><surname>Schmidt</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Leveraging large language models for decision support in personalized oncology</article-title><source>JAMA Netw Open</source><year>2023</year><month>11</month><day>1</day><volume>6</volume><issue>11</issue><fpage>e2343689</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2023.43689</pub-id><pub-id pub-id-type="medline">37976064</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Adejumo</surname><given-names>P</given-names> </name><name name-style="western"><surname>Thangaraj</surname><given-names>PM</given-names> </name><name name-style="western"><surname>Dhingra</surname><given-names>LS</given-names> </name><etal/></person-group><article-title>Natural language processing of clinical documentation to assess functional status in patients 
with heart failure</article-title><source>JAMA Netw Open</source><year>2024</year><month>11</month><day>4</day><volume>7</volume><issue>11</issue><fpage>e2443925</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.43925</pub-id><pub-id pub-id-type="medline">39509128</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Li</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Application of large language models in medical training evaluation-using ChatGPT as a standardized patient: multimetric assessment</article-title><source>J Med Internet Res</source><year>2025</year><month>01</month><day>1</day><volume>27</volume><fpage>e59435</fpage><pub-id pub-id-type="doi">10.2196/59435</pub-id><pub-id pub-id-type="medline">39742453</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><article-title>ChatGPT: friend or foe?</article-title><source>Lancet Digit Health</source><year>2023</year><month>03</month><volume>5</volume><issue>3</issue><fpage>e102</fpage><pub-id pub-id-type="doi">10.1016/S2589-7500(23)00023-7</pub-id><pub-id pub-id-type="medline">36754723</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tam</surname><given-names>TYC</given-names> </name><name name-style="western"><surname>Sivarajkumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kapoor</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A framework for human evaluation of large language models in healthcare derived from literature 
review</article-title><source>NPJ Digit Med</source><year>2024</year><month>09</month><day>28</day><volume>7</volume><issue>1</issue><fpage>258</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01258-7</pub-id><pub-id pub-id-type="medline">39333376</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirosawa</surname><given-names>T</given-names> </name><name name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Tokumasu</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ito</surname><given-names>T</given-names> </name><name name-style="western"><surname>Suzuki</surname><given-names>T</given-names> </name><name name-style="western"><surname>Shimizu</surname><given-names>T</given-names> </name></person-group><article-title>Comparative study to evaluate the accuracy of differential diagnosis lists generated by Gemini Advanced, Gemini, and Bard for a case report series analysis: cross-sectional study</article-title><source>JMIR Med Inform</source><year>2024</year><month>10</month><day>2</day><volume>12</volume><fpage>e63010</fpage><pub-id pub-id-type="doi">10.2196/63010</pub-id><pub-id pub-id-type="medline">39357052</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gilson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Safranek</surname><given-names>CW</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>T</given-names> </name><etal/></person-group><article-title>How does ChatGPT perform on the United States Medical Licensing Examination (USMLE)? 
The implications of large language models for medical education and knowledge assessment</article-title><source>JMIR Med Educ</source><year>2023</year><month>02</month><day>8</day><volume>9</volume><fpage>e45312</fpage><pub-id pub-id-type="doi">10.2196/45312</pub-id><pub-id pub-id-type="medline">36753318</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sonoda</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Kurokawa</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hagiwara</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Structured clinical reasoning prompt enhances LLM&#x2019;s diagnostic capabilities in diagnosis please quiz cases</article-title><source>Jpn J Radiol</source><year>2025</year><month>04</month><volume>43</volume><issue>4</issue><fpage>586</fpage><lpage>592</lpage><pub-id pub-id-type="doi">10.1007/s11604-024-01712-2</pub-id><pub-id pub-id-type="medline">39625594</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Scaff</surname><given-names>SPS</given-names> </name><name name-style="western"><surname>Reis</surname><given-names>FJJ</given-names> </name><name name-style="western"><surname>Ferreira</surname><given-names>GE</given-names> </name><name name-style="western"><surname>Jacob</surname><given-names>MF</given-names> </name><name name-style="western"><surname>Saragiotto</surname><given-names>BT</given-names> </name></person-group><article-title>Assessing the performance of AI chatbots in answering patients&#x2019; common questions about low back pain</article-title><source>Ann Rheum 
Dis</source><year>2025</year><month>01</month><volume>84</volume><issue>1</issue><fpage>143</fpage><lpage>149</lpage><pub-id pub-id-type="doi">10.1136/ard-2024-226202</pub-id><pub-id pub-id-type="medline">39874229</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>T</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>R</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Evaluating the performance of state-of-the-art artificial intelligence chatbots based on the WHO global guidelines for the prevention of surgical site infection: cross-sectional study</article-title><source>J Med Internet Res</source><year>2025</year><month>07</month><day>31</day><volume>27</volume><fpage>e75567</fpage><pub-id pub-id-type="doi">10.2196/75567</pub-id><pub-id pub-id-type="medline">40744114</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mori</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Izumiyama</surname><given-names>T</given-names> </name><name name-style="western"><surname>Kanabuchi</surname><given-names>R</given-names> </name><name name-style="western"><surname>Mori</surname><given-names>N</given-names> </name><name name-style="western"><surname>Aizawa</surname><given-names>T</given-names> </name></person-group><article-title>Large language model may assist diagnosis of SAPHO syndrome by bone scintigraphy</article-title><source>Mod Rheumatol</source><year>2024</year><month>08</month><day>20</day><volume>34</volume><issue>5</issue><fpage>1043</fpage><lpage>1046</lpage><pub-id pub-id-type="doi">10.1093/mr/road115</pub-id><pub-id 
pub-id-type="medline">38153762</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>LT</given-names> </name><name name-style="western"><surname>Sinkler</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Adelstein</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Voos</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Calcei</surname><given-names>JG</given-names> </name></person-group><article-title>ChatGPT responses to common questions about anterior cruciate ligament reconstruction are frequently satisfactory</article-title><source>Arthroscopy</source><year>2024</year><month>07</month><volume>40</volume><issue>7</issue><fpage>2058</fpage><lpage>2066</lpage><pub-id pub-id-type="doi">10.1016/j.arthro.2023.12.009</pub-id><pub-id pub-id-type="medline">38171421</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chou</surname><given-names>R</given-names> </name></person-group><article-title>Low back pain</article-title><source>Ann Intern Med</source><year>2021</year><month>08</month><volume>174</volume><issue>8</issue><fpage>ITC113</fpage><lpage>ITC128</lpage><pub-id pub-id-type="doi">10.7326/AITC202108170</pub-id><pub-id pub-id-type="medline">34370518</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Knezevic</surname><given-names>NN</given-names> </name><name name-style="western"><surname>Candido</surname><given-names>KD</given-names> </name><name name-style="western"><surname>Vlaeyen</surname><given-names>JWS</given-names> </name><name name-style="western"><surname>Van 
Zundert</surname><given-names>J</given-names> </name><name name-style="western"><surname>Cohen</surname><given-names>SP</given-names> </name></person-group><article-title>Low back pain</article-title><source>Lancet</source><year>2021</year><month>07</month><day>3</day><volume>398</volume><issue>10294</issue><fpage>78</fpage><lpage>92</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(21)00733-9</pub-id><pub-id pub-id-type="medline">34115979</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mesk&#x00F3;</surname><given-names>B</given-names> </name></person-group><article-title>Prompt engineering as an important emerging skill for medical professionals: tutorial</article-title><source>J Med Internet Res</source><year>2023</year><month>10</month><day>4</day><volume>25</volume><fpage>e50638</fpage><pub-id pub-id-type="doi">10.2196/50638</pub-id><pub-id pub-id-type="medline">37792434</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Shi</surname><given-names>CL</given-names> </name><name name-style="western"><surname>Jeon</surname><given-names>CO</given-names> </name><etal/></person-group><article-title>ChatGPT and generative AI are revolutionizing the scientific community: a janus-faced conundrum</article-title><source>Imeta</source><year>2024</year><month>04</month><volume>3</volume><issue>2</issue><fpage>e178</fpage><pub-id pub-id-type="doi">10.1002/imt2.178</pub-id><pub-id pub-id-type="medline">38882492</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Maaz</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Palaganas</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Palaganas</surname><given-names>G</given-names> </name><name name-style="western"><surname>Bajwa</surname><given-names>M</given-names> </name></person-group><article-title>A guide to prompt design: foundations and applications for healthcare simulationists</article-title><source>Front Med (Lausanne)</source><year>2024</year><volume>11</volume><fpage>1504532</fpage><pub-id pub-id-type="doi">10.3389/fmed.2024.1504532</pub-id><pub-id pub-id-type="medline">39980724</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huo</surname><given-names>B</given-names> </name><name name-style="western"><surname>Boyle</surname><given-names>A</given-names> </name><name name-style="western"><surname>Marfo</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Large language models for chatbot health advice studies: a systematic review</article-title><source>JAMA Netw Open</source><year>2025</year><month>02</month><day>3</day><volume>8</volume><issue>2</issue><fpage>e2457879</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.57879</pub-id><pub-id pub-id-type="medline">39903463</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Carroll</surname><given-names>AN</given-names> </name><name name-style="western"><surname>Storms</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Malempati</surname><given-names>C</given-names> </name><name name-style="western"><surname>Shanavas</surname><given-names>RV</given-names> </name><name name-style="western"><surname>Badarudeen</surname><given-names>S</given-names> </name></person-group><article-title>Generative 
artificial intelligence and prompt engineering: a primer for orthopaedic surgeons</article-title><source>JBJS Rev</source><year>2024</year><month>10</month><day>1</day><volume>12</volume><issue>10</issue><pub-id pub-id-type="doi">10.2106/JBJS.RVW.24.00122</pub-id><pub-id pub-id-type="medline">39361780</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Casazza</surname><given-names>BA</given-names> </name></person-group><article-title>Diagnosis and treatment of acute low back pain</article-title><source>Am Fam Physician</source><year>2012</year><month>02</month><day>15</day><volume>85</volume><issue>4</issue><fpage>343</fpage><lpage>350</lpage><pub-id pub-id-type="medline">22335313</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Verhagen</surname><given-names>AP</given-names> </name><name name-style="western"><surname>Downie</surname><given-names>A</given-names> </name><name name-style="western"><surname>Popal</surname><given-names>N</given-names> </name><name name-style="western"><surname>Maher</surname><given-names>C</given-names> </name><name name-style="western"><surname>Koes</surname><given-names>BW</given-names> </name></person-group><article-title>Red flags presented in current low back pain guidelines: a review</article-title><source>Eur Spine J</source><year>2016</year><month>09</month><volume>25</volume><issue>9</issue><fpage>2788</fpage><lpage>2802</lpage><pub-id pub-id-type="doi">10.1007/s00586-016-4684-0</pub-id><pub-id pub-id-type="medline">27376890</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kunze</surname><given-names>KN</given-names> </name><name 
name-style="western"><surname>Varady</surname><given-names>NH</given-names> </name><name name-style="western"><surname>Mazzucco</surname><given-names>M</given-names> </name><etal/></person-group><article-title>The large language model ChatGPT-4 exhibits excellent triage capabilities and diagnostic performance for patients presenting with various causes of knee pain</article-title><source>Arthroscopy</source><year>2025</year><month>05</month><volume>41</volume><issue>5</issue><fpage>1438</fpage><lpage>1447</lpage><pub-id pub-id-type="doi">10.1016/j.arthro.2024.06.021</pub-id><pub-id pub-id-type="medline">38925234</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pagano</surname><given-names>S</given-names> </name><name name-style="western"><surname>Strumolo</surname><given-names>L</given-names> </name><name name-style="western"><surname>Michalk</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Evaluating ChatGPT, Gemini and other large language models (LLMs) in orthopaedic diagnostics: a prospective clinical study</article-title><source>Comput Struct Biotechnol J</source><year>2025</year><volume>28</volume><fpage>9</fpage><lpage>15</lpage><pub-id pub-id-type="doi">10.1016/j.csbj.2024.12.013</pub-id><pub-id pub-id-type="medline">39850460</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pagano</surname><given-names>S</given-names> </name><name name-style="western"><surname>Holzapfel</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kappenschneider</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Arthrosis diagnosis and treatment recommendations in clinical practice: an exploratory investigation with the generative AI model 
GPT-4</article-title><source>J Orthop Traumatol</source><year>2023</year><month>11</month><day>28</day><volume>24</volume><issue>1</issue><fpage>61</fpage><pub-id pub-id-type="doi">10.1186/s10195-023-00740-4</pub-id><pub-id pub-id-type="medline">38015298</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rajkumar</surname><given-names>SV</given-names> </name><name name-style="western"><surname>Dimopoulos</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Palumbo</surname><given-names>A</given-names> </name><etal/></person-group><article-title>International Myeloma Working Group updated criteria for the diagnosis of multiple myeloma</article-title><source>Lancet Oncol</source><year>2014</year><month>11</month><volume>15</volume><issue>12</issue><fpage>e538</fpage><lpage>e548</lpage><pub-id pub-id-type="doi">10.1016/S1470-2045(14)70442-5</pub-id><pub-id pub-id-type="medline">25439696</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cowan</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Green</surname><given-names>DJ</given-names> </name><name name-style="western"><surname>Kwok</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Diagnosis and management of multiple myeloma: a review</article-title><source>JAMA</source><year>2022</year><month>02</month><day>1</day><volume>327</volume><issue>5</issue><fpage>464</fpage><lpage>477</lpage><pub-id pub-id-type="doi">10.1001/jama.2022.0003</pub-id><pub-id pub-id-type="medline">35103762</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Abbasian</surname><given-names>M</given-names> </name><name name-style="western"><surname>Khatibi</surname><given-names>E</given-names> </name><name name-style="western"><surname>Azimi</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Foundation metrics for evaluating effectiveness of healthcare conversations powered by generative AI</article-title><source>NPJ Digit Med</source><year>2024</year><month>03</month><day>29</day><volume>7</volume><issue>1</issue><fpage>82</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01074-z</pub-id><pub-id pub-id-type="medline">38553625</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Segal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Daher</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Artificial intelligence versus orthopedic surgeons as an orthopedic consultant in the emergency department</article-title><source>Injury</source><year>2025</year><month>04</month><volume>56</volume><issue>4</issue><fpage>112297</fpage><pub-id pub-id-type="doi">10.1016/j.injury.2025.112297</pub-id><pub-id pub-id-type="medline">40147063</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ferdinandov</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yankov</surname><given-names>D</given-names> </name><name name-style="western"><surname>Trandzhiev</surname><given-names>M</given-names> </name></person-group><article-title>Common differential diagnosis of low back pain in contemporary medical practice: a narrative review</article-title><source>Front Med 
(Lausanne)</source><year>2024</year><volume>11</volume><fpage>1366514</fpage><pub-id pub-id-type="doi">10.3389/fmed.2024.1366514</pub-id><pub-id pub-id-type="medline">38379555</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Urits</surname><given-names>I</given-names> </name><name name-style="western"><surname>Burshtein</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sharma</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Low back pain, a comprehensive review: pathophysiology, diagnosis, and treatment</article-title><source>Curr Pain Headache Rep</source><year>2019</year><month>03</month><day>11</day><volume>23</volume><issue>3</issue><fpage>23</fpage><pub-id pub-id-type="doi">10.1007/s11916-019-0757-1</pub-id><pub-id pub-id-type="medline">30854609</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirosawa</surname><given-names>T</given-names> </name><name name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Mizuta</surname><given-names>K</given-names> </name><name name-style="western"><surname>Sakamoto</surname><given-names>T</given-names> </name><name name-style="western"><surname>Tokumasu</surname><given-names>K</given-names> </name><name name-style="western"><surname>Shimizu</surname><given-names>T</given-names> </name></person-group><article-title>Diagnostic performance of generative artificial intelligences for a series of complex case reports</article-title><source>Digit Health</source><year>2024</year><volume>10</volume><fpage>20552076241265215</fpage><pub-id pub-id-type="doi">10.1177/20552076241265215</pub-id><pub-id 
pub-id-type="medline">39229463</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Esmaeilzadeh</surname><given-names>P</given-names> </name></person-group><article-title>Generative AI in medical practice: in-depth exploration of privacy and security challenges</article-title><source>J Med Internet Res</source><year>2024</year><month>03</month><day>8</day><volume>26</volume><fpage>e53008</fpage><pub-id pub-id-type="doi">10.2196/53008</pub-id><pub-id pub-id-type="medline">38457208</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rosic</surname><given-names>A</given-names> </name></person-group><article-title>Legal implications of artificial intelligence in health care</article-title><source>Clin Dermatol</source><year>2024</year><volume>42</volume><issue>5</issue><fpage>451</fpage><lpage>459</lpage><pub-id pub-id-type="doi">10.1016/j.clindermatol.2024.06.014</pub-id><pub-id pub-id-type="medline">38936641</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wells</surname><given-names>BJ</given-names> </name><name name-style="western"><surname>Nguyen</surname><given-names>HM</given-names> </name><name name-style="western"><surname>McWilliams</surname><given-names>A</given-names> </name><etal/></person-group><article-title>A practical framework for appropriate implementation and review of artificial intelligence (FAIR-AI) in healthcare</article-title><source>NPJ Digit Med</source><year>2025</year><month>08</month><day>11</day><volume>8</volume><issue>1</issue><fpage>514</fpage><pub-id 
pub-id-type="doi">10.1038/s41746-025-01900-y</pub-id><pub-id pub-id-type="medline">40790350</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hassan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kushniruk</surname><given-names>A</given-names> </name><name name-style="western"><surname>Borycki</surname><given-names>E</given-names> </name></person-group><article-title>Barriers to and facilitators of artificial intelligence adoption in health care: scoping review</article-title><source>JMIR Hum Factors</source><year>2024</year><month>08</month><day>29</day><volume>11</volume><fpage>e48633</fpage><pub-id pub-id-type="doi">10.2196/48633</pub-id><pub-id pub-id-type="medline">39207831</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hoppe</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Auer</surname><given-names>MK</given-names> </name><name name-style="western"><surname>Str&#x00FC;ven</surname><given-names>A</given-names> </name><name name-style="western"><surname>Massberg</surname><given-names>S</given-names> </name><name name-style="western"><surname>Stremmel</surname><given-names>C</given-names> </name></person-group><article-title>ChatGPT with GPT-4 outperforms emergency department physicians in diagnostic accuracy: retrospective analysis</article-title><source>J Med Internet Res</source><year>2024</year><month>07</month><day>8</day><volume>26</volume><fpage>e56110</fpage><pub-id pub-id-type="doi">10.2196/56110</pub-id><pub-id pub-id-type="medline">38976865</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Daneshjou</surname><given-names>R</given-names> </name><name name-style="western"><surname>Smith</surname><given-names>MP</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>MD</given-names> </name><name name-style="western"><surname>Rotemberg</surname><given-names>V</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>J</given-names> </name></person-group><article-title>Lack of transparency and potential bias in artificial intelligence data sets and algorithms: a scoping review</article-title><source>JAMA Dermatol</source><year>2021</year><month>11</month><day>1</day><volume>157</volume><issue>11</issue><fpage>1362</fpage><lpage>1369</lpage><pub-id pub-id-type="doi">10.1001/jamadermatol.2021.3129</pub-id><pub-id pub-id-type="medline">34550305</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arora</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alderman</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Palmer</surname><given-names>J</given-names> </name><etal/></person-group><article-title>The value of standards for health datasets in artificial intelligence-based applications</article-title><source>Nat Med</source><year>2023</year><month>11</month><volume>29</volume><issue>11</issue><fpage>2929</fpage><lpage>2938</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02608-w</pub-id><pub-id pub-id-type="medline">37884627</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Preiksaitis</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ashenburg</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Bunney</surname><given-names>G</given-names> </name><etal/></person-group><article-title>The role of large language models in transforming emergency medicine: scoping review</article-title><source>JMIR Med Inform</source><year>2024</year><month>05</month><day>10</day><volume>12</volume><fpage>e53787</fpage><pub-id pub-id-type="doi">10.2196/53787</pub-id><pub-id pub-id-type="medline">38728687</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pham</surname><given-names>JH</given-names> </name><name name-style="western"><surname>Thongprayoon</surname><given-names>C</given-names> </name><name name-style="western"><surname>Miao</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Large language model triaging of simulated nephrology patient inbox messages</article-title><source>Front Artif Intell</source><year>2024</year><volume>7</volume><fpage>1452469</fpage><pub-id pub-id-type="doi">10.3389/frai.2024.1452469</pub-id><pub-id pub-id-type="medline">39315245</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Preadjudication interrater agreement between 2 surgeons in the screened cohort.</p><media xlink:href="jmir_v28i1e92315_app1.docx" xlink:title="DOCX File, 14 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>Operational diagnostic criteria used for case inclusion and expert reference adjudication.</p><media xlink:href="jmir_v28i1e92315_app2.docx" xlink:title="DOCX File, 15 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>The main features and default inference parameters of the 2 state-of-the-art large language models (LLMs) used in this study.</p><media xlink:href="jmir_v28i1e92315_app3.docx" xlink:title="DOCX File, 14 
KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>Structured prompts for the large language models (LLMs) (DeepSeek V3.2 and ChatGPT 5.1) and a complete example.</p><media xlink:href="jmir_v28i1e92315_app4.docx" xlink:title="DOCX File, 679 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Patient History Structured Questionnaire.</p><media xlink:href="jmir_v28i1e92315_app5.docx" xlink:title="DOCX File, 14 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Representative scoring examples for the groundedness domain and scoring anchors for the harm domain.</p><media xlink:href="jmir_v28i1e92315_app6.docx" xlink:title="DOCX File, 19 KB"/></supplementary-material><supplementary-material id="app7"><label>Multimedia Appendix 7</label><p>Demographic information of included patients.</p><media xlink:href="jmir_v28i1e92315_app7.docx" xlink:title="DOCX File, 12 KB"/></supplementary-material><supplementary-material id="app8"><label>Multimedia Appendix 8</label><p>Comparison of the triage accuracy of the large language models (LLMs) for low back pain.</p><media xlink:href="jmir_v28i1e92315_app8.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material><supplementary-material id="app9"><label>Multimedia Appendix 9</label><p>Comparison of the preliminary diagnosis accuracy of the large language models (LLMs) for low back pain.</p><media xlink:href="jmir_v28i1e92315_app9.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material><supplementary-material id="app10"><label>Multimedia Appendix 10</label><p>Comparison of the differential diagnosis agreement of the large language models (LLMs) for low back pain.</p><media xlink:href="jmir_v28i1e92315_app10.docx" xlink:title="DOCX File, 18 KB"/></supplementary-material><supplementary-material id="app11"><label>Multimedia Appendix 11</label><p>Interrater agreement for 
performance evaluation of the large language models (LLMs).</p><media xlink:href="jmir_v28i1e92315_app11.docx" xlink:title="DOCX File, 14 KB"/></supplementary-material><supplementary-material id="app12"><label>Multimedia Appendix 12</label><p>Comparison of the evaluation of the rated model rationales of the large language models (LLMs) for low back pain.</p><media xlink:href="jmir_v28i1e92315_app12.docx" xlink:title="DOCX File, 13 KB"/></supplementary-material><supplementary-material id="app13"><label>Multimedia Appendix 13</label><p>Number of safety-risk cases in 4 prespecified red-flag conditions by model and phase.</p><media xlink:href="jmir_v28i1e92315_app13.docx" xlink:title="DOCX File, 1456 KB"/></supplementary-material><supplementary-material id="app14"><label>Checklist 1</label><p>TRIPOD-LLM checklist.</p><media xlink:href="jmir_v28i1e92315_app14.pdf" xlink:title="PDF File, 142 KB"/></supplementary-material></app-group></back></article>