<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e69929</article-id><article-id pub-id-type="doi">10.2196/69929</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Performance of ChatGPT-4o and Four Open-Source Large Language Models in Generating Diagnoses Based on China&#x2019;s Rare Disease Catalog: Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Zhong</surname><given-names>Wei</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>YiFan</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Yan</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yang</surname><given-names>Kai</given-names></name><degrees>PhD</degrees><xref 
ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gao</surname><given-names>HuiMin</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yan</surname><given-names>HuiHui</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hao</surname><given-names>WenJing</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Yan</surname><given-names>YouSheng</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Yin</surname><given-names>ChengHong</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1"/><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Prenatal Diagnosis, Beijing Obstetrics and Gynecology Hospital, Capital Medical University, Beijing Maternal and Child Health Care Hospital</institution><addr-line>No. 
251 Yaojiayuan Road, Chaoyang District</addr-line><addr-line>Beijing</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Schwartz</surname><given-names>Amy</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Patel</surname><given-names>Dhavalkumar</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Chrimes</surname><given-names>Dillon</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Modersohn</surname><given-names>Luise</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Myers</surname><given-names>Skatje</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to ChengHong Yin, MD, Department of Prenatal Diagnosis, Beijing Obstetrics and Gynecology Hospital, Capital Medical University, Beijing Maternal and Child Health Care Hospital, No. 251 Yaojiayuan Road, Chaoyang District, Beijing, China, 8618810963279; <email>yinchh@ccmu.edu.cn</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>18</day><month>6</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e69929</elocation-id><history><date date-type="received"><day>11</day><month>12</month><year>2024</year></date><date date-type="rev-recd"><day>04</day><month>04</month><year>2025</year></date><date date-type="accepted"><day>04</day><month>05</month><year>2025</year></date></history><copyright-statement>&#x00A9; Wei Zhong, YiFan Liu, Yan Liu, Kai Yang, HuiMin Gao, HuiHui Yan, WenJing Hao, YouSheng Yan, ChengHong Yin. 
Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 18.6.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e69929"/><abstract><sec><title>Background</title><p>Diagnosing rare diseases remains challenging due to their inherent complexity and limited physician knowledge. Large language models (LLMs) offer new potential to enhance diagnostic workflows.</p></sec><sec><title>Objective</title><p>This study aimed to evaluate the diagnostic accuracy of ChatGPT-4o and 4 open-source LLMs (qwen2.5:7b, Llama3.1:8b, qwen2.5:72b, and Llama3.1:70b) for rare diseases, assess the language effect on diagnostic performance, and explore retrieval augmented generation (RAG) and chain-of-thought (CoT) reasoning.</p></sec><sec sec-type="methods"><title>Methods</title><p>We extracted clinical manifestations of 121 rare diseases from China&#x2019;s inaugural rare disease catalog. 
ChatGPT-4o generated a primary and 5 differential diagnoses, while 4 LLMs were assessed in both English and Chinese contexts. The lowest-performing model underwent RAG and CoT re-evaluation. Diagnostic accuracy was compared via the McNemar test. A survey evaluated 11 clinicians&#x2019; familiarity with rare diseases.</p></sec><sec sec-type="results"><title>Results</title><p>ChatGPT-4o demonstrated the highest diagnostic accuracy with 90.1%. Language effects varied across models: qwen2.5:7b showed comparable performance in Chinese (51.2%) and English (47.9%; <italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=0.32, <italic>P</italic>=.57), whereas Llama3.1:8b exhibited significantly higher English accuracy (67.8% vs 31.4%; <italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=40.20, <italic>P</italic>&#x003C;.001). Among larger models, qwen2.5:72b maintained cross-lingual consistency considering the odds ratio (OR; Chinese: 82.6% vs English: 83.5%; OR 0.88, 95% CI 0.27-2.76,<italic>P</italic>=1.000), contrasting with Llama3.1:70b&#x2019;s language-dependent variation (Chinese: 80.2% vs English: 90.1%; OR 0.29,95% CI 0.08-0.83, <italic>P</italic>=.02). Cross-model comparisons revealed Llama3.1:8b underperformed qwen2.5:7b in Chinese (<italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=13.22,<italic>P</italic>&#x003C;.001) but surpassed it in English (<italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=13.92,<italic>P</italic>&#x003C;.001). No significant differences were observed between qwen2.5:72b and Llama3.1:70b (English: OR 0.33, <italic>P</italic>=.08; Chinese: OR 1.5, 95% CI 0.48-5.12,<italic>P</italic>=.07); qwen2.5:72b matched ChatGPT-4o&#x2019;s performance in both languages (English: OR 0.33, <italic>P</italic>=.08; Chinese: OR 0.44, <italic>P</italic>=.09); Llama3.1:70b mirrored ChatGPT-4o&#x2019;s English accuracy (OR 1, <italic>P</italic>=1.000) but lagged in Chinese (OR 0.33; <italic>P</italic>=.02). 
RAG implementation enhanced qwen2.5:7b&#x2019;s accuracy to 79.3% (<italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=31.11, <italic>P</italic>&#x003C;.001) with 85.9% retrieval precision. The distilled model Deepseek-R1:7b markedly underperformed (9.9% vs qwen2.5:7b; <italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=42.19, <italic>P</italic>&#x003C;.001). Clinician surveys revealed significant knowledge gaps in rare disease management.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>ChatGPT-4o demonstrated superior diagnostic performance for rare diseases. While Llama3.1:8b demonstrates viability for localized deployment in resource-constrained English diagnostic workflows, Chinese applications require larger models to achieve comparable diagnostic accuracy. This urgency is heightened by the release of open-source models like DeepSeek-R1, which may see rapid adoption without thorough validation. Successful clinical implementation of LLMs requires 3 core elements: model parameterization, user language, and pretraining data. The integration of RAG significantly enhanced open-source LLM accuracy for rare disease diagnosis, although caution remains warranted for low-parameter reasoning models showing substantial performance limitations. 
We recommend hospital IT departments and policymakers prioritize language relevance in model selection and consider integrating RAG with curated knowledge bases to enhance diagnostic utility in constrained settings, while exercising caution with low-parameter models.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>ChatGPT</kwd><kwd>rare diseases</kwd><kwd>Llama</kwd><kwd>open-source LLMs</kwd><kwd>retrieval augmented generation</kwd><kwd>chain-of-thought</kwd><kwd>Deepseek</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Although the prevalence of individual rare diseases is low, their collective impact on the global population is considerable due to their vast diversity and number [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. These conditions are primarily genetic in origin, characterized by limited treatment options and substantial financial burdens [<xref ref-type="bibr" rid="ref3">3</xref>]. The diagnosis remains a complex and prolonged process [<xref ref-type="bibr" rid="ref4">4</xref>], with patients often enduring a diagnostic odyssey averaging 5 years before receiving a conclusive diagnosis [<xref ref-type="bibr" rid="ref1">1</xref>]. This prolonged uncertainty exacerbates patient distress and strains health care systems. While whole-exome sequencing has proven to be an efficacious diagnostic tool for most rare diseases in clinical practice, high cost and specialized expertise requirements limit its widespread use [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. 
Alternatively, phenotype-driven diagnostic approaches present a more affordable and expedient solution, although their reliance on precise phenotypic terminology poses challenges for both patients and clinicians [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>The recent advancements of large language models (LLMs) have expanded their applications in medical diagnostics [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. With comprehensive medical knowledge, these models serve as diagnostic aids through natural language interactions [<xref ref-type="bibr" rid="ref11">11</xref>], demonstrating promising capabilities in rare disease diagnosis [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. While clinicians are experienced in diagnosing common conditions, they often lack proficiency in managing the complex nature of rare diseases [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref14">14</xref>]. Consequently, integrating LLMs into the diagnostic workflow for rare diseases holds significant clinical value, especially for patients using these tools for self-diagnosis [<xref ref-type="bibr" rid="ref12">12</xref>]. Existing studies have focused on specific rare diseases or limited disease groups [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. However, assessing LLM performance across all rare diseases is challenging and impractical. Given geographic variations in rare disease prevalence [<xref ref-type="bibr" rid="ref2">2</xref>], prioritizing regionally prevalent rare diseases for LLM development enhances diagnostic efficiency and improves local health care outcomes.</p><p>To advance rare disease management, China has established a national catalog of 207 rare diseases selected through established criteria including incidence, severity, and diagnostic feasibility. 
However, phenotype-based preliminary diagnosis remains challenging, with even specialists requiring guidance from senior clinicians [<xref ref-type="bibr" rid="ref5">5</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. Evaluating LLMs&#x2019; diagnostic capabilities within this framework holds critical value, enabling both assessment of clinical applicability in Chinese populations and generation of transferable insights for global health systems.</p><p>Furthermore, LLM performance variations stem from diverse architectures, parameter scales, and language configurations [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>]. While commercial models excel in accuracy, open-source alternatives gain traction through local deployment advantages that safeguard patient privacy&#x2014;a critical consideration given modern models&#x2019; ability to process sensitive data like medical images [<xref ref-type="bibr" rid="ref12">12</xref>]. This necessitates rigorous evaluation of open-source models&#x2019; diagnostic suitability, particularly for rare diseases. Current research gaps persist, with limited studies on open-source LLMs for rare disease diagnosis and no systematic cross-linguistic evaluations addressing language, model parameters, and regional contexts. Such assessments are essential for optimizing LLM adoption in primary care settings and non-English health care systems, enabling evidence-based model selection aligned with local needs.</p><p>Open-source LLMs achieve domain competence through specialized fine-tuning, although their performance remains constrained by base architecture limitations [<xref ref-type="bibr" rid="ref19">19</xref>]. 
Retrieval-augmented generation (RAG) enhances diagnostic accuracy by integrating domain-specific knowledge bases, effectively reducing hallucinations [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>], yet its application in rare disease diagnostics remains underexplored, hindering clinical implementation.</p><p>The scalable DeepSeek-R1 architecture (1.5B-671B parameters) advances medical LLM development [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. While its chain-of-thought (CoT) reasoning succeeds in general cognitive tasks, clinical validation for rare disease diagnosis, requiring specialized reasoning patterns, remains lacking.</p><p>This 3-phase investigation first evaluates ChatGPT-4o&#x2019;s diagnostic accuracy using clinical manifestations from China&#x2019;s rare disease catalog. We then assess 4 open-source LLMs (Chinese and United States&#x2013;developed models with different parameters) in bilingual contexts (Chinese and English) to quantify language effects. Concurrently, we validate RAG and CoT for capability enhancement. Finally, clinician surveys across specialties (n=11) measure rare disease knowledge gaps, informing LLM implementation needs. This multilevel assessment aims to (1) determine clinical applicability of LLMs, (2) establish model selection criteria (parameters, language, and origin), and (3) guide development of region-specific diagnostic tools.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Diagnostic Trial Study Design</title><p>We evaluated LLM diagnostic performance through 3 sequential phases: (1) baseline establishment: ChatGPT-4o&#x2019;s English performance as commercial reference standard. (2) Open-source model assessment: cross-linguistic accuracy (Chinese and English queries), parameter scaling effects (7b to 72b variants), architecture comparison (Chinese vs United States&#x2013;developed models). 
(3) Capability enhancement: controlled RAG and CoT testing. This framework addresses 3 performance determinants (parametric scale, language alignment, and developmental origin), with the goal of resolving the diagnostic performance disparity between open-source and commercial LLMs in rare disease contexts.</p></sec><sec id="s2-2"><title>Data Source and Collection for China&#x2019;s Rare Disease Catalog</title><p>The National Health Commission of China has published the Rare Disease Catalog, which currently includes 207 conditions that meet 4 mandatory criteria: (1) low prevalence (&#x003C;1/500,000 or neonatal morbidity&#x003C;1/10,000), (2) severe health impact, (3) established diagnostic protocols, and (4) actionable treatment pathways. The catalog can be updated only after a period of at least 2 years, with the inaugural version in 2018 listing 121 diseases and the 2023 update adding 86 new entries.</p><p>Clinical phenotypes were extracted from the National Rare Disease Registry System (NRDRS) [<xref ref-type="bibr" rid="ref26">26</xref>], a centralized platform managed by Peking Union Medical College Hospital, integrating data from 107 collaborating institutions. As of 2025, the NRDRS contains 92,600 cases with demographic, diagnostic, therapeutic, and survival parameters from 107 institutions.</p><p>For LLM evaluation, we systematically curated deidentified symptom descriptions from NRDRS, removing disease-specific identifiers (nosology, pathogenic variants, and subtype classifications) to simulate patient narratives. 
Given the genetic basis of most cataloged diseases, and because their diagnostic criteria typically hinge on genetic testing rather than clinical symptoms or auxiliary diagnostic tests, we have designated genetic testing as a key criterion for classifying diseases as genetic within the catalog, in anticipation of subsequent analyses.</p><p>The clinical manifestations of all diseases were translated from Chinese to English using DeepL&#x2019;s web interface [<xref ref-type="bibr" rid="ref27">27</xref>], with each disease processed as an independent translation unit. Translated outputs underwent manual review by bilingual researchers to ensure completeness and preserve semantic accuracy, particularly for critical diagnostic descriptors. Verified translations preserved the source text&#x2019;s original structure and were directly used as clinical case descriptions for LLM diagnostic evaluations in English.</p></sec><sec id="s2-3"><title>Diagnostic Flow</title><p>Using ChatGPT-4o&#x2019;s application programming interface, we prompted the model: &#x201C;As a doctor specializing in rare diseases, what kind of rare disease is most likely to be diagnosed based on the clinical manifestations of the disease I have provided you? In addition, 5 other possible diagnoses need to be listed.&#x201D; To prevent conversational confounders, we used single-turn queries with history truncation, requiring models to output primary diagnosis and ranked differentials (top 5) solely from input data.</p><p>We clarify that LLMs cannot definitively diagnose rare diseases based solely on symptoms (clinical manifestations are insufficient for confirmation). Since most rare diseases require genetic testing or laboratory confirmation, absence of expected diagnoses in outputs does not imply model error but reflects symptom overlap among diseases. Our objective was to assess whether models can prioritize catalog diseases from clinical prompts, mimicking clinicians&#x2019; preliminary diagnosis. 
Correct diagnosis required inclusion of exact disease names, accepted synonyms, or broader categories in primary and differential lists. Two clinical physicians reviewed the outcomes of the model&#x2019;s outputs, and in cases of disagreement, a third physician was consulted to reach a consensus through thorough discussion.</p></sec><sec id="s2-4"><title>Selection of Open-Source LLMs</title><p>We evaluated 4 open-source LLMs: qwen2.5:7b/72b (Alibaba) and Llama3.1:8b/70b (Meta), representing diverse parameters and geographic origins. All models ran locally via the Ollama framework [<xref ref-type="bibr" rid="ref28">28</xref>] with a custom interface &#x201C;Chat&#x201D; [<xref ref-type="bibr" rid="ref29">29</xref>]. The models were configured with a temperature setting of 0.1 to ensure precision in outcomes, using prompt wording and diagnostic procedures analogous to ChatGPT-4o. Each model underwent sequential evaluation: English first, then Chinese.</p></sec><sec id="s2-5"><title>Statistical Analysis</title><p>ChatGPT-4o&#x2019;s accuracy was assessed via Pearson chi-square (<italic>&#x03C7;</italic>&#x00B2;) tests stratified by genetic status. In addition, 4 open-source models underwent bilingual testing (121 cases), generating 8 response sets. McNemar tests (<italic>&#x03C7;</italic>&#x00B2; if discordant pairs &#x2265;25; exact binomial otherwise) compared 36 model pairs plus 2 RAG and CoT interventions (38 comparisons). Total assessments (N=39) used a Bonferroni-adjusted &#x03B1;=.001, and marginal findings (<italic>P</italic>&#x003C;.05) were also reported to reduce type II errors. Analyses used R (R Core Team) version 4.3.2.</p></sec><sec id="s2-6"><title>Construction of Rare Disease Knowledge Base and Application of RAG</title><p>Leveraging the outputs from ChatGPT-4o, we used the open-source framework MaxKB (GitHub version 1.7.2) [<xref ref-type="bibr" rid="ref30">30</xref>] to implement RAG for constructing a knowledge base specific to rare diseases. 
In addition, we did not correct the minority of diagnostic errors present in ChatGPT-4o&#x2019;s conversational content, as the primary objective of engaging in RAG was to explore the precision of its retrieval capabilities. It was also of interest to observe whether open-source LLMs would adhere to the principle of accurate retrieval despite the presence of incorrect conclusions within the knowledge base.</p><p>Text segmentation used &#x201C;user&#x201D; as delimiter, generating 121 text blocks (726 entries) representing complete diagnostic responses. The retrieval process used MaxKB&#x2019;s proprietary embedding model (maxkb-embedding) for dense vector representations. For each query, we retrieved the top 1 most semantically relevant text block based on cosine similarity thresholding (&#x003E;0.6). The system imposed a 2144-character constraint per retrieved segment to ensure clinical context preservation. When no qualifying segments were identified, a standardized null-response protocol (&#x201C;There are no related contents.&#x201D;) was triggered. The lowest-performing model in English was chosen for RAG to assess whether its diagnostic capabilities would improve post-RAG integration. It is defined as correct retrieval when the primary diagnosis output by the generative model is consistent with the primary diagnosis provided in the retrieved knowledge base (regardless of whether the diagnosis is correct).</p></sec><sec id="s2-7"><title>CoT Model Selection Rationale</title><p>The DeepSeek-R1 architecture (671B parameters) was distilled into 2 CoT variants: DeepSeek-R1:7B (from qwen2.5:7B) and DeepSeek-R1:8B (from Llama3.1:8B). While these distilled models were not directly evaluated, their base models underwent full diagnostic benchmarking in this study. This experimental design enables comparative analysis between a base model and its distilled counterpart to assess CoT-mediated performance. 
We selected the lowest-performing English base model (qwen2.5:7B) for CoT testing, ensuring performance changes reflected CoT architecture.</p></sec><sec id="s2-8"><title>Questionnaire for China&#x2019;s Rare Disease Catalog</title><p>The questionnaire initially aimed to compare physician diagnoses with LLM outputs. However, most surveyed diseases fell outside clinicians&#x2019; expertise. Moreover, in clinical practice, they rely heavily on whole-exome sequencing for diagnostics, suggesting that the accuracy of diagnoses for the diseases might be close to zero.</p><p>The revised survey assessed multidisciplinary specialists&#x2019; (prenatal diagnosticians, pediatricians, radiologists, etc) awareness of China&#x2019;s Rare Disease Catalog to evaluate the need for LLM-assisted diagnostics. The selection of physicians was not random but rather aimed at maximizing the inclusion of professionals from relevant fields that our research team had access to, ensuring a diverse representation of perspectives.</p></sec><sec id="s2-9"><title>Ethical Considerations</title><p>This study used publicly available web-based textual data that did not contain any specific patient information. As a simulation of clinical diagnostic trials without intervention on real patients, this study complies with the institutional guidelines for ethics committee exemption.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Diagnostic Accuracy of ChatGPT-4o</title><p><xref ref-type="fig" rid="figure1">Figure 1</xref> illustrates the methodological workflow encompassing all analytical phases. English-translated clinical cases (n=121) were evaluated (full outputs: <xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref> and <xref ref-type="supplementary-material" rid="app2">2</xref>). 
The outputs comprised 86.78% genetic diseases and 13.22% nongenetic diseases, highlighting the genetic preponderance in China&#x2019;s rare disease registry (<xref ref-type="table" rid="table1">Table 1</xref>). ChatGPT-4o achieved an overall accuracy of 90.1%, with no significant difference in performance between genetic and nongenetic diseases. Among correct cases, 13.8% (15/109) were listed as differentials.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Comprehensive design flowchart of the study. ChatGPT-4o provided diagnostic assessments for 121 diseases in both languages. In parallel, 4 open-source large language models (LLMs) generated 8 response sets (4 models&#x00D7;2 languages); chain-of-thought integration with large language model outputs yielded 11 evaluation groups. CoT: chain-of-thought; LLM: large language model; RAG: retrieval augmented generation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e69929_fig01.png"/></fig><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>ChatGPT-4o diagnosis of 121 diseases in China&#x2019;s first catalog of rare diseases.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom" colspan="2">Diagnosis, n (%)</td><td align="left" valign="bottom">Total, n (%)</td><td align="left" valign="bottom">95% CI</td></tr><tr><td align="left" valign="top"/><td align="left" valign="top">Accuracy</td><td align="left" valign="top">Misdiagnosis</td><td align="left" valign="top"/><td align="left" valign="top"/></tr></thead><tbody><tr><td align="left" valign="top">Genetic<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">94 (89.5)</td><td 
align="left" valign="top">11 (10.5)</td><td align="left" valign="top">105 (86.8)</td><td align="left" valign="top" rowspan="2">0.83&#x2010;1.10</td></tr><tr><td align="left" valign="top">Nongenetic<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">15 (93.8)</td><td align="left" valign="top">1 (6.3)</td><td align="left" valign="top">16 (13.2)</td></tr><tr><td align="left" valign="top">Total</td><td align="left" valign="top">109 (90.1)</td><td align="left" valign="top">12 (9.9)</td><td align="left" valign="top">121 (100)</td><td align="left" valign="top"/></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup><italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=0.28.</p></fn><fn id="table1fn2"><p><sup>b</sup><italic>P</italic>=.60.</p></fn><fn id="table1fn3"><p><sup>c</sup>Relative risk (RR)=0.96.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Diagnosis of Rare Diseases by 4 Open-Source LLMs in Chinese and English</title><sec id="s3-2-1"><title>Results of Diagnosis</title><p>Furthermore, 4 open-source LLMs diagnosed 121 rare diseases in both English and Chinese. Their performance, as illustrated in <xref ref-type="fig" rid="figure2">Figure 2</xref>, revealed notable variations across models and languages.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Correct number and proportion of cases by 4 open-source large language models in English and Chinese. 
Bilingual accuracy patterns reveal model-specific language dependencies, with Chinese qwen2.5 models demonstrating linguistic invariance contrasting with US Llama3.1 models&#x2019; English-dominant profiles.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e69929_fig02.png"/></fig></sec><sec id="s3-2-2"><title>Performance in English</title><p>The LLMs exhibited a range of diagnostic capabilities, with Llama3.1:70b achieving the highest accuracy, followed by qwen2.5:72b, Llama3.1:8b, and qwen2.5:7b. Among the correct diagnoses, some were identified through differential diagnosis: qwen2.5:7b resolved 14 cases (24.1%), Llama3.1:8b resolved 15 cases (18.1%), qwen2.5:72b resolved 12 cases (11.9%), and Llama3.1:70b resolved 14 cases (12.8%). Notably, Llama3.1:8b initially refused to respond to one specific case (congenital hyperinsulinemic hypoglycemia) but provided a diagnosis upon re-engagement with identical settings.</p></sec><sec id="s3-2-3"><title>Performance in Chinese</title><p>Qwen2.5:72b achieved the highest number of correct diagnoses, closely approaching Llama3.1:70b, followed by qwen2.5:7b, and finally Llama3.1:8b.</p><p>Moreover, qwen2.5 models (7b and 72b) demonstrated consistent performance across languages, maintaining strong diagnostic accuracy. In contrast, the Llama3.1 models (8b and 70b) showed reduced accuracy in Chinese compared with English, highlighting language-specific limitations. For differential diagnoses, qwen2.5:7b identified 11 cases (17.74%), Llama3.1:8b identified 6 cases (15.8%), qwen2.5:72b identified 10 cases (10%), and Llama3.1:70b identified 9 cases (9.3%). In addition, Llama3.1:70b frequently defaulted to English responses, requiring prompt adjustments to elicit Chinese outputs.</p><p>The analysis reveals significant language-mediated accuracy variations in LLM diagnostics. 
Chinese qwen2.5 models (7b/72b) showed cross-lingual consistency, whereas US Llama3.1 models (8b/70b) demonstrated language-specific performance, with English accuracy surpassing Chinese baselines.</p></sec></sec><sec id="s3-3"><title>Multiple McNemar Tests for Diagnostic Accuracy of LLMs</title><p>ChatGPT-4o and 4 open-source LLMs generated 9 response sets. A total of 36 McNemar test comparisons across the sets revealed distinct patterns (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). For language effects, qwen2.5:7b showed comparable Chinese (51.2%) and English accuracy (47.9%; <italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=0.32, <italic>P</italic>=.57) and Llama3.1:8b exhibited English superiority (67.8% vs 31.4%; <italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=40.20, <italic>P</italic>&#x003C;.001). Larger models displayed architectural divergence where qwen2.5:72b maintained cross-lingual consistency (82.6% vs 83.5%; OR 0.88, <italic>P</italic>=1.000), while Llama3.1:70b showed language-dependent performance (80.2% vs 90.1%; OR 0.29, <italic>P</italic>=.02, marginally significant).</p><p>Cross-model comparisons (<xref ref-type="fig" rid="figure3">Figure 3</xref>) revealed Llama3.1:8b underperformed qwen2.5:7b in Chinese (<italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=13.22, <italic>P</italic>&#x003C;.001) but excelled in English (<italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=13.92, <italic>P</italic>&#x003C;.001). No significant differences between qwen2.5:72b and Llama3.1:70b (English: OR 0.33, <italic>P</italic>=.08; Chinese: OR 1.5, <italic>P</italic>=.07) were found. qwen2.5:72b matched ChatGPT-4o&#x2019;s performance in both languages (English: OR 0.33, <italic>P</italic>=.08; Chinese: OR 0.44, <italic>P</italic>=.09), and Llama3.1:70b mirrored ChatGPT-4o&#x2019;s English accuracy (OR 1, <italic>P</italic>=1) but lagged in Chinese (OR 0.33; <italic>P</italic>=.02; marginally significant). 
Moreover, diagnostic accuracy scaled positively with parameter count across both languages (<xref ref-type="fig" rid="figure3">Figures 3</xref> and <xref ref-type="fig" rid="figure4">4</xref>).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Cross-model diagnostic accuracy comparisons using McNemar tests. Performance evaluation based on rare disease, with models tested in Chinese and English. Significance notation: non-significant (<italic>P</italic>&#x2265;.05); *marginally significant (<italic>P</italic>&#x003C;.05); **statistically significant (<italic>P</italic>&#x003C;.001 post-Bonferroni correction).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e69929_fig03.png"/></fig><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Diagnostic accuracy comparison matrix. Heat map visualization of cross-model <italic>P</italic> values (n=36 comparisons). Language identifiers: English and Chinese. Diagonal: identical dataset pairs (noncomparisons). Symmetric mirroring: Upper/lower triangle equivalence. Color encoding: darker hues indicate stronger significance.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e69929_fig04.png"/></fig></sec><sec id="s3-4"><title>Enhancing Diagnostic Accuracy for Rare Diseases With RAG Technology</title><p>Among the evaluated models, qwen2.5:7b showed the lowest English diagnostic accuracy. Implementing RAG with a knowledge base built from ChatGPT-4o&#x2019;s diagnostic outputs significantly improved its performance: accuracy increased to 79.3% (&#x0394;+31.4% from baseline; <xref ref-type="table" rid="table2">Table 2</xref>), with qwen2.5:7b achieving 85.9% retrieval precision under imperfect knowledge bases. Theoretically, these data suggest that RAG-enhanced accuracy could reach 85.9% with perfect retrieval. 
This is comparable to ChatGPT-4o&#x2019;s benchmark of 90.1% when using identical clinical case inputs.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Diagnostic performance of qwen2.5:7b with retrieval augmented generation intervention.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Before RAG<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>, n (%)</td><td align="left" valign="bottom">After RAG, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Correct diagnosis<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">58 (47.9)</td><td align="left" valign="top">96 (79.3)</td></tr><tr><td align="left" valign="top">Incorrect diagnosis<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="top">63 (52.1)</td><td align="left" valign="top">25 (20.7)</td></tr><tr><td align="left" valign="top">Correct retrieval</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td><td align="left" valign="top">104 (85.9)</td></tr><tr><td align="left" valign="top">Incorrect retrieval</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">17 (14.1)</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>RAG: retrieval augmented generation.</p></fn><fn id="table2fn2"><p><sup>b</sup><italic>&#x03C7;</italic><sup>2</sup><sub>1</sub>=31.11.</p></fn><fn id="table2fn3"><p><sup>c</sup><italic>P</italic> value &#x003C;.001.</p></fn><fn id="table2fn4"><p><sup>d</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>For disease 35 (glycogen storage disease), qwen2.5:7b incorrectly diagnosed the case and omitted differentials. 
Notably, the disease was correctly identified before RAG implementation with accurate knowledge base entries (primary diagnosis). Moreover, qwen2.5:7b consistently failed to diagnose cases with inaccurate knowledge base content.</p></sec><sec id="s3-5"><title>CoT Diagnostic Performance</title><p>The distilled DeepSeek-R1:7b demonstrated significant accuracy degradation when assessed on the same 121 rare diseases. Compared with its base model qwen2.5:7b (47.9% accuracy), DeepSeek-R1:7b achieved only 9.9% diagnostic precision (12/121 cases; &#x0394;&#x2013;38.0%, <italic>&#x03C7;</italic>&#x00B2;<sub>1</sub>=42.19, <italic>P</italic>&#x003C;.001).</p></sec><sec id="s3-6"><title>Survey on Awareness of China&#x2019;s Rare Disease Catalog</title><p>We surveyed 11 clinical physicians from various departments in top-tier Chinese hospitals to assess their familiarity with diseases in China&#x2019;s rare disease catalog. These physicians, especially 3 prenatal diagnosticians, are likely to encounter a wider range of rare diseases due to their practice settings. Departments like obstetrics, pediatrics, surgery, internal medicine, and radiology also commonly encounter rare diseases, while specialists in reproductive medicine may less frequently encounter them.</p><p>China&#x2019;s rare disease catalog has been public since 2018. Data (<xref ref-type="table" rid="table3">Table 3</xref>) show that even experienced prenatal diagnosticians have limited awareness of the catalog&#x2019;s diseases. Specialists in departments with a narrow disease focus, such as reproductive medicine, also tend to lack understanding of rare diseases. Our findings indicate that physicians&#x2019; knowledge of rare diseases is related to their department, experience, and training. 
Senior physicians generally know more about rare diseases but lack comprehensive knowledge, while junior attending physicians have more knowledge gaps.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Clinical physicians&#x2019; familiarity with diseases from the first batch of China&#x2019;s rare disease catalog.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Specialties and titles</td><td align="left" valign="bottom">Years of experience</td><td align="left" valign="bottom">Aware of catalog, n (%)</td><td align="left" valign="bottom">Able to diagnose, n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">Prenatal diagnosis</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Chief physician</td><td align="left" valign="top">21</td><td align="left" valign="top">112/121 (92.91)</td><td align="left" valign="top">71/121 (58.51)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Associate chief</td><td align="left" valign="top">29</td><td align="left" valign="top">103/121 (85.21)</td><td align="left" valign="top">98/121 (80.99)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending physician</td><td align="left" valign="top">3</td><td align="left" valign="top">85/121 (70.25)</td><td align="left" valign="top">23/121 (18.93)</td></tr><tr><td align="left" valign="top">Obstetrics</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending physician</td><td align="left" 
valign="top">12</td><td align="left" valign="top">22/121 (18.2)</td><td align="left" valign="top">5/121 (4.13)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending physician</td><td align="left" valign="top">12</td><td align="left" valign="top">35/121 (28.93)</td><td align="left" valign="top">15/121 (12.40)</td></tr><tr><td align="left" valign="top">Pediatrics</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending physician</td><td align="left" valign="top">7</td><td align="left" valign="top">65/121 (53.7)</td><td align="left" valign="top">10/121 (8.26)</td></tr><tr><td align="left" valign="top">Radiology</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending physician</td><td align="left" valign="top">5</td><td align="left" valign="top">37/121 (30.58)</td><td align="left" valign="top">15/121 (12.40)</td></tr><tr><td align="left" valign="top">Surgery</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending physician</td><td align="left" valign="top">4</td><td align="left" valign="top">30/121 (24.79)</td><td align="left" valign="top">16/121 (13.22)</td></tr><tr><td align="left" valign="top">Internal medicine</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending physician</td><td align="left" 
valign="top">4</td><td align="left" valign="top">17/121 (14.05)</td><td align="left" valign="top">8/121 (6.61)</td></tr><tr><td align="left" valign="top">Reproductive medicine</td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending physician</td><td align="left" valign="top">5</td><td align="left" valign="top">21/121 (17.36)</td><td align="left" valign="top">1/121 (0.83)</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Attending physician</td><td align="left" valign="top">4</td><td align="left" valign="top">40/121 (33.31)</td><td align="left" valign="top">13/121 (10.74)</td></tr></tbody></table></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>Our cross-linguistic evaluation identified 3 critical factors governing LLM diagnostic performance. ChatGPT-4o achieved the highest diagnostic accuracy in rare disease assessment, highlighting LLMs&#x2019; potential in this domain. Open-source models showed strong dependence on parameter scale, input language, and developer origin. Implementation of RAG markedly improved qwen2.5:7b&#x2019;s diagnostic accuracy, contrasting sharply with the performance degradation observed in CoT-enhanced models. 
Clinician surveys reinforced these technical findings, revealing substantial knowledge gaps among specialists&#x2014;even senior practitioners demonstrated limited familiarity with China&#x2019;s Rare Disease Catalog entries, underscoring the clinical necessity for LLM-assisted diagnostic systems.</p></sec><sec id="s4-2"><title>Limitations</title><p>First, our evaluation focused on China&#x2019;s first rare disease catalog, potentially limiting generalizability to newly recognized or ultra-rare conditions. However, this focused approach prioritizes clinically impactful diseases over obscure textbook entries, enhancing practical relevance.</p><p>Second, the diagnostic evaluation was based solely on clinical manifestations, which may not fully reflect the complexity of real-world scenarios. This limitation could potentially reduce the diagnostic accuracy of the models. However, using clinical manifestations is still important for guiding clinicians to consider rare diseases, thereby reducing misdiagnoses and diagnostic delays. In addition, phenotype-driven strategies are valuable for minimizing the costs of genetic testing. Future work should integrate multimodal data (imaging and biomarkers) to better replicate real-world decision-making.</p><p>Third, Chinese-to-English translation via DeepL may introduce subtle inaccuracies. Although the translations were reviewed manually, some errors were inevitable, which potentially reduced the accuracy of the model in English. However, given the strong language comprehension ability of LLMs and the fact that the study results did not indicate higher accuracy for LLMs in Chinese, we believe that the translation process did not affect the statistical conclusions.</p><p>Finally, this study did not assess open-source models with parameters between 8 billion and 70 billion. This was due to the selection of two pairs of LLMs with similar parameters, which allowed for a more detailed comparison across different model aspects. 
Some models with fewer than 70b parameters may also perform well in diagnosing rare diseases, especially as technology keeps advancing.</p></sec><sec id="s4-3"><title>Comparison With Previous Work</title><p>This study&#x2019;s clinical phenotypes derive from NRDRS, specifically targeting conditions in the nationally mandated catalog. The curated phenotypic profiles intentionally incorporate diagnostic uncertainty through multidisease symptom overlaps, rigorously simulating real-world differential diagnosis challenges for LLM evaluation. Catalog inclusion prioritizes diseases with elevated clinical urgency in China&#x2019;s epidemiological context, characterized by high disease burden, actionable diagnostic criteria, and treatment-responsive outcomes. While global repositories (eg, OMIM [Online Mendelian Inheritance in Man] and Orphanet) provide extensive rare disease data, their geographic prevalence distributions introduce significant epidemiological mismatches for regional implementation. This justifies prioritizing region-specific frameworks for clinical LLM validation.</p><p>Recent studies show ChatGPT can help diagnose rare and complex diseases by analyzing medical histories and test results, offering potential diagnoses and treatment plans [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. This aids health care professionals in faster, more accurate rare disease diagnosis [<xref ref-type="bibr" rid="ref33">33</xref>]. However, ChatGPT-4 and Llama2 are more proficient in diagnosing common diseases than rare ones [<xref ref-type="bibr" rid="ref12">12</xref>], which can be attributed to the limited training data in LLMs. Nevertheless, as LLMs improve, this gap should narrow [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. In our investigation, the latest ChatGPT-4o achieved excellent accuracy diagnosing China&#x2019;s initial rare disease catalog. 
While we recognize the transformative potential of ChatGPT-4o in rare diseases, we also concur that ChatGPT should be used as a supplementary tool, rather than a substitute for medical expertise [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref34">34</xref>,<xref ref-type="bibr" rid="ref35">35</xref>].</p><p>Open-source LLMs enable hospital-specific diagnostic tools through local deployment, showing potential to surpass commercial models via targeted fine-tuning [<xref ref-type="bibr" rid="ref36">36</xref>,<xref ref-type="bibr" rid="ref37">37</xref>]. However, the proliferation of heterogeneous models (varying parameters or developers) complicates optimal model selection for rare disease applications.</p><p>This study involved deploying 4 open-source LLMs from different vendors with varying parameters to test their ability to diagnose rare diseases in Chinese and English. The goal was to compare diagnostic accuracy across models and assess their performance. Recent research shows ChatGPT performs unevenly across languages, with a big gap between its English skills and other languages [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref38">38</xref>]. For example, ChatGPT scores 82.67% in English sentence completion but only 35.85% in Arabic [<xref ref-type="bibr" rid="ref39">39</xref>]. Its response accuracy drops from 71.49% in English to 42.74% in Arabic. In medical settings, ChatGPT works better with English prompts, even when analyzing Chinese medical reports [<xref ref-type="bibr" rid="ref38">38</xref>]. These findings stress the need for better multilingual models and fixing language biases.</p><p>Our study did not assess the diagnostic prowess of ChatGPT-4o in Chinese, given its established English proficiency. 
Benchmarking against ChatGPT-4o&#x2019;s English performance enabled systematic assessment of open-source models&#x2019; diagnostic parity with commercial counterparts.</p><p>Our findings confirm a strong positive correlation between model parameters and diagnostic accuracy. The qwen2.5:72b demonstrated a significant improvement in English accuracy over its smaller 7b-parameter counterpart, while Llama3.1:70b achieved accuracy comparable to ChatGPT-4o. Notably, language adaptation capabilities varied significantly by model origin: Chinese-developed qwen2.5 series maintained minimal accuracy variance between Chinese and English, attributable to balanced bilingual training data. In contrast, United States&#x2013;developed Llama3.1 exhibited English-centric biases: the 8B-parameter version showed higher English versus Chinese accuracy, requiring 70b parameters to achieve Chinese diagnostic parity with qwen2.5:72b. In addition, although only marginally significant, Llama3.1:70b&#x2019;s English accuracy surpassed its Chinese performance, highlighting persistent linguistic disparities in cross-regional model development. These results align with emerging evidence that model architecture and training corpus composition (not just parameter scaling) critically determine multilingual diagnostic capability [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref17">17</xref>].</p><p>The current data do not show that ChatGPT-4o&#x2019;s diagnostic capabilities are significantly higher than those of open-source models with parameters around 70 billion, even when compared with the worst-performing diagnostic scenario (Chinese_Llama3.1:70b). Although this study established a strict significance level using Bonferroni adjustment, which prevented the demonstration of a significant difference between Chinese_Llama3.1:70b and ChatGPT-4o, an accuracy exceeding 80% is already sufficient for a foundational model. 
Domain-specific fine-tuning could further enhance performance for clinical deployment. In addition, in English-speaking countries, smaller models (Llama3.1:8b) with reinforcement strategies achieve acceptable accuracy at reduced computational costs.</p><p>Using the local language for diagnosing rare diseases may offer benefits, as models can access knowledge aligned with a country&#x2019;s disease incidence rates. For example, in our review of generated outputs, Chinese_Llama3.1:70b correctly diagnosed nonsyndromic deafness (case 83), while English Llama3.1:70b and ChatGPT-4o did not. Chinese Llama3.1:70b identified the <italic>GJB2</italic> gene mutation&#x2013;linked nonsyndromic hearing loss, a common cause in China due to widespread prenatal deafness gene screening and internet-based information. In contrast, English Llama3.1:70b suggested Pendred syndrome, which Chinese physicians usually call &#x201C;Goiter-deafness syndrome&#x201D; and is not in China&#x2019;s rare disease catalog. This suggests LLMs may adapt responses to a user&#x2019;s national context based on language, which could improve diagnostic relevance and accuracy. Instead of just translating to English, incorporating the user&#x2019;s national background in LLM-driven diagnostics could be more effective, especially in models with large parameters.</p><p>Recent studies show RAG can boost LLM performance in medical tasks by integrating external databases, tackling issues like hallucination and outdated knowledge [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref40">40</xref>]. A case study on retrieving medical guidelines and treatment recommendations from curated medical resources demonstrated that RAG significantly improved model performance in terms of factual accuracy, completeness, user preference, and safety compared with standard LLMs [<xref ref-type="bibr" rid="ref41">41</xref>]. 
Another study developed a customized LLM framework that combines RAG and prompt engineering to accurately interpret medical guidelines for the management of patients with chronic hepatitis C virus infection, thereby improving clinical decision support systems with promising results [<xref ref-type="bibr" rid="ref42">42</xref>]. These studies highlight RAG&#x2019;s potential in medical LLMs. Our study explored RAG to bridge the gap between open-source LLMs and ChatGPT-4o in the diagnosis of rare diseases. Results show that even small-parameter LLMs with RAG and a good knowledge base can match ChatGPT-4o&#x2019;s accuracy. However, issues were identified with the qwen2.5:7b model using RAG, like not following prompts to generate differential diagnoses and incorrectly retrieving answers, even with matching queries. This suggests RAG might struggle with flexible clinical descriptions of rare diseases. RAG&#x2019;s retrieval traceability remains its paramount strength, enabling clinicians to independently verify source validity rather than blindly trusting LLM outputs. Overall, RAG improves open-source models&#x2019; rare disease diagnostic performance, potentially surpassing commercial models, yet underscores the necessity for further optimization to ensure reliable retrievals.</p><p>DeepSeek&#x2019;s recently released CoT-optimized models [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>] present both opportunities and challenges for clinical AI integration. While CoT architectures excel in complex reasoning tasks, particularly arithmetic and commonsense inference, their computational intensity creates latency-performance tradeoffs. 
Emerging evidence suggests CoT frameworks may reduce diagnostic hallucinations and enhance decision interpretability in medicine [<xref ref-type="bibr" rid="ref43">43</xref>-<xref ref-type="bibr" rid="ref46">46</xref>], although effectiveness varies significantly across model scales, with larger architectures like GPT-4 showing superior adaptability [<xref ref-type="bibr" rid="ref47">47</xref>]. Our findings reveal a paradoxical 38% accuracy decline in DeepSeek-R1:7b versus its base model, likely stemming from domain-specific knowledge loss during medical-focused distillation. This contrasts with code-oriented tasks where distillation typically improves efficacy, suggesting medical diagnosis requires preservation of specialized clinical reasoning patterns vulnerable to compression. To mitigate these limitations, we propose (1) postdistillation medical domain adaptation through targeted fine-tuning, or (2) deployment of larger CoT variants (eg, DeepSeek-R1:32b) to maintain diagnostic fidelity.</p><p>The physician survey confirmed the anticipated correlation between physicians&#x2019; knowledge of rare diseases and factors such as their specialty, professional experience, and training exposure. The scarcity of physicians proficient in diagnosing rare diseases from clinical phenotypes [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref48">48</xref>] contributes to the protracted time frames, often several years, for the confirmation of most rare diseases [<xref ref-type="bibr" rid="ref1">1</xref>]. Phenotype-based tools and research aimed at prioritizing disease-causing genes in genetic disorders have long been pivotal in the diagnosis of rare diseases [<xref ref-type="bibr" rid="ref7">7</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]. 
However, the conversion of natural language into standardized Human Phenotype Ontology terms presents a persistent challenge that taxes the knowledge base of clinical physicians, and it is also a process that is both time-consuming and labor-intensive. Given that physicians are adept at medical record documentation, interfacing with LLMs using the medical terminology of clinical diagnostic tasks significantly boosts the efficiency of phenotype-based rare disease diagnosis.</p><p>Despite the current limitations of even the most sophisticated LLM in achieving comprehensive diagnostic accuracy across all rare diseases, the clinical integration of LLM diagnostic tools necessitates a strategic equilibrium between technological innovation and patient safety. To ensure privacy protection, LLMs should be implemented via hospital-based on-premises deployment, operating within isolated intranet environments and achieving seamless integration with existing electronic health record systems to eliminate risks of sensitive data exposure. Throughout diagnostic workflows, LLM outputs must be explicitly labeled as &#x201C;auxiliary recommendations,&#x201D; requiring mandatory human validation by attending physicians who must corroborate findings against comprehensive patient histories, laboratory evidence, and updated clinical guidelines. In rare disease scenarios, definitive diagnoses shall only be established following multidisciplinary consultations and evidence-based literature verification of LLM-generated hypotheses.</p></sec><sec id="s4-4"><title>Conclusions</title><p>In clinical practice where physicians frequently struggle to promptly diagnose rare diseases, ChatGPT-4o demonstrated superior accuracy in China&#x2019;s rare disease catalog. Moreover, optimizing parameters, language alignment, and pretraining data origins in open-source LLMs, combined with RAG augmentation, enhanced diagnostic precision to near-commercial performance. 
Caution remains warranted for low-parameter reasoning models showing substantial performance limitations. These findings establish hospital-specific LLM assistants as a feasible pathway for high-accuracy rare disease diagnosis.</p></sec></sec></body><back><ack><p>We are grateful for the support of the following funds: (1) National Key Research and Development Program of China (2023YFC2705600); (2) Capital Clinical Characteristic Diagnosis and Treatment Technology Research and Transformation Application Project (Z221100007422012); (3) Beijing Hospital Management Center "Yangfan" Plan 3.0 Clinical Technology Innovation Project (ZLRK202329); (4) Science and Technology Innovation and Transformation Special Project of Beijing Obstetrics and Gynecology Hospital Affiliated to Capital Medical University/Beijing Maternal and Child Health Hospital (FCYYZH202201).</p></ack><notes><sec><title>Data Availability</title><p>All data generated or analyzed during this study are included in this published article (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendices 1</xref><xref ref-type="supplementary-material" rid="app2"/>-<xref ref-type="supplementary-material" rid="app3">3</xref>).</p></sec></notes><fn-group><fn fn-type="con"><p>CY (lead) and YY (equal) were responsible for conceptualization. WZ led data curation, supported by YFL, YL, KY, HG, HY, and WH. WZ also led the formal analysis, with support from YY. CY led funding acquisition, equally supported by YY. WZ developed the methodology, with assistance from YFL, YL, and KY. YY oversaw project administration, with CY and WZ contributing equally. WZ provided the resources. CY led supervision, with equal support from YY. YFL led validation, equally supported by YL, and received additional contributions from KY, HG, HY, and WH. WZ led visualization with support from YFL. WZ prepared the original draft. 
WZ also led writing &#x2013; review and editing, with support from YY.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">CoT</term><def><p>chain-of-thought</p></def></def-item><def-item><term id="abb2">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb3">NRDRS</term><def><p>National Rare Disease Registry System</p></def></def-item><def-item><term id="abb4">OR</term><def><p>odds ratio</p></def></def-item><def-item><term id="abb5">RAG</term><def><p>Retrieval Augmented Generation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hartin</surname><given-names>SN</given-names> </name><name name-style="western"><surname>Means</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Alaimo</surname><given-names>JT</given-names> </name><name name-style="western"><surname>Younger</surname><given-names>ST</given-names> </name></person-group><article-title>Expediting rare disease diagnosis: a call to bridge the gap between clinical and functional genomics</article-title><source>Mol Med</source><year>2020</year><month>11</month><day>25</day><volume>26</volume><issue>1</issue><fpage>117</fpage><pub-id pub-id-type="doi">10.1186/s10020-020-00244-5</pub-id><pub-id pub-id-type="medline">33238891</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chung</surname><given-names>CCY</given-names> </name><name name-style="western"><surname>Project</surname><given-names>HKG</given-names> </name><name name-style="western"><surname>Chu</surname><given-names>ATW</given-names> </name><name 
name-style="western"><surname>Chung</surname><given-names>BHY</given-names> </name></person-group><article-title>Rare disease emerging as a global public health priority</article-title><source>Front Public Health</source><year>2022</year><volume>10</volume><fpage>1028545</fpage><pub-id pub-id-type="doi">10.3389/fpubh.2022.1028545</pub-id><pub-id pub-id-type="medline">36339196</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Umlai</surname><given-names>UK</given-names> </name><name name-style="western"><surname>Bangarusamy</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Estivill</surname><given-names>X</given-names> </name><name name-style="western"><surname>Jithesh</surname><given-names>PV</given-names> </name></person-group><article-title>Genome sequencing data analysis for rare disease gene discovery</article-title><source>Brief Bioinform</source><year>2022</year><month>01</month><day>17</day><volume>23</volume><issue>1</issue><fpage>bbab363</fpage><pub-id pub-id-type="doi">10.1093/bib/bbab363</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Banerjee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Taroni</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Allaway</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Prasad</surname><given-names>DV</given-names> </name><name name-style="western"><surname>Guinney</surname><given-names>J</given-names> </name><name name-style="western"><surname>Greene</surname><given-names>C</given-names> </name></person-group><article-title>Machine learning in rare disease</article-title><source>Nat 
Methods</source><year>2023</year><month>06</month><volume>20</volume><issue>6</issue><fpage>803</fpage><lpage>814</lpage><pub-id pub-id-type="doi">10.1038/s41592-023-01886-z</pub-id><pub-id pub-id-type="medline">37248386</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Faviez</surname><given-names>C</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>X</given-names> </name><name name-style="western"><surname>Garcelon</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Diagnosis support systems for rare diseases: a scoping review</article-title><source>Orphanet J Rare Dis</source><year>2020</year><month>04</month><day>16</day><volume>15</volume><issue>1</issue><fpage>94</fpage><pub-id pub-id-type="doi">10.1186/s13023-020-01374-z</pub-id><pub-id pub-id-type="medline">32299466</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McNeill</surname><given-names>A</given-names> </name></person-group><article-title>Good genotype-phenotype relationships in rare disease are hard to find</article-title><source>Eur J Hum Genet</source><year>2022</year><month>03</month><volume>30</volume><issue>3</issue><fpage>251</fpage><pub-id pub-id-type="doi">10.1038/s41431-022-01062-5</pub-id><pub-id pub-id-type="medline">35260823</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lagorce</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lebreton</surname><given-names>E</given-names> </name><name name-style="western"><surname>Matalonga</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Phenotypic 
similarity-based approach for variant prioritization for unsolved rare disease: a preliminary methodological report</article-title><source>Eur J Hum Genet</source><year>2024</year><month>02</month><volume>32</volume><issue>2</issue><fpage>182</fpage><lpage>189</lpage><pub-id pub-id-type="doi">10.1038/s41431-023-01486-7</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jacobsen</surname><given-names>JOB</given-names> </name><name name-style="western"><surname>Kelly</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cipriani</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Phenotype-driven approaches to enhance variant prioritization and diagnosis of rare disease</article-title><source>Hum Mutat</source><year>2022</year><month>08</month><volume>43</volume><issue>8</issue><fpage>1071</fpage><lpage>1081</lpage><pub-id pub-id-type="doi">10.1002/humu.24380</pub-id><pub-id pub-id-type="medline">35391505</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Assessing the utility of ChatGPT throughout the entire clinical workflow: development and usability study</article-title><source>J Med Internet Res</source><year>2023</year><month>08</month><day>22</day><volume>25</volume><fpage>e48659</fpage><pub-id pub-id-type="doi">10.2196/48659</pub-id><pub-id pub-id-type="medline">37606976</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Habib</surname><given-names>S</given-names> </name><name name-style="western"><surname>Butt</surname><given-names>H</given-names> </name><name name-style="western"><surname>Goldenholz</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Goldenholz</surname><given-names>DM</given-names> </name></person-group><article-title>Large language model performance on practice epilepsy board examinations</article-title><source>JAMA Neurol</source><year>2024</year><month>06</month><day>1</day><volume>81</volume><issue>6</issue><fpage>660</fpage><lpage>661</lpage><pub-id pub-id-type="doi">10.1001/jamaneurol.2024.0676</pub-id><pub-id pub-id-type="medline">38587850</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goh</surname><given-names>E</given-names> </name><name name-style="western"><surname>Gallo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hom</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Large language model influence on diagnostic reasoning: a randomized clinical trial</article-title><source>JAMA Netw Open</source><year>2024</year><month>10</month><day>1</day><volume>7</volume><issue>10</issue><fpage>e2440969</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.40969</pub-id><pub-id pub-id-type="medline">39466245</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sandmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Riepenhausen</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Plagwitz</surname><given-names>L</given-names> </name><name name-style="western"><surname>Varghese</surname><given-names>J</given-names> </name></person-group><article-title>Systematic analysis of ChatGPT, Google search and Llama 2 for clinical decision support tasks</article-title><source>Nat Commun</source><year>2024</year><month>03</month><day>6</day><volume>15</volume><issue>1</issue><fpage>2050</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-46411-8</pub-id><pub-id pub-id-type="medline">38448475</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Han</surname><given-names>M</given-names> </name><etal/></person-group><article-title>RDmaster: A novel phenotype-oriented dialogue system supporting differential diagnosis of rare disease</article-title><source>Comput Biol Med</source><year>2024</year><month>02</month><volume>169</volume><fpage>107924</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.107924</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhai</surname><given-names>W</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>N</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>S</given-names> </name></person-group><article-title>Phen2Disease: a phenotype-driven model for disease and gene prioritization by bidirectional maximum matching semantic similarities</article-title><source>Brief 
Bioinform</source><year>2023</year><month>07</month><day>20</day><volume>24</volume><issue>4</issue><fpage>bbad172</fpage><pub-id pub-id-type="doi">10.1093/bib/bbad172</pub-id><pub-id pub-id-type="medline">37248747</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Ran</surname><given-names>AR</given-names> </name><name name-style="western"><surname>Nguyen</surname><given-names>TX</given-names> </name><etal/></person-group><article-title>What can GPT-4 do for diagnosing rare eye diseases? A pilot study</article-title><source>Ophthalmol Ther</source><year>2023</year><month>12</month><volume>12</volume><issue>6</issue><fpage>3395</fpage><lpage>3402</lpage><pub-id pub-id-type="doi">10.1007/s40123-023-00789-8</pub-id><pub-id pub-id-type="medline">37656399</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Segal</surname><given-names>S</given-names> </name><name name-style="western"><surname>Khanna</surname><given-names>AK</given-names> </name></person-group><article-title>Anesthetic management of a patient with juvenile hyaline fibromatosis: a case report written with the assistance of the large language model ChatGPT</article-title><source>Cureus</source><year>2023</year><month>03</month><volume>15</volume><issue>3</issue><fpage>e35946</fpage><pub-id pub-id-type="doi">10.7759/cureus.35946</pub-id><pub-id pub-id-type="medline">37038572</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>K</given-names> 
</name><name name-style="western"><surname>Jagadeesh</surname><given-names>A</given-names> </name><etal/></person-group><article-title>The potential and pitfalls of using a large language model such as ChatGPT, GPT-4, or LLaMA as a clinical assistant</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1884</fpage><lpage>1891</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae184</pub-id><pub-id pub-id-type="medline">39018498</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qiu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Towards building multilingual language model for medicine</article-title><source>Nat Commun</source><year>2024</year><month>09</month><day>27</day><volume>15</volume><issue>1</issue><fpage>8384</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-52417-z</pub-id><pub-id pub-id-type="medline">39333468</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>C</given-names> </name></person-group><article-title>Fine-tuning large language models for rare disease concept normalization</article-title><source>J Am Med Inform 
Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>2076</fpage><lpage>2083</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae133</pub-id><pub-id pub-id-type="medline">38829731</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zarfati</surname><given-names>M</given-names> </name><name name-style="western"><surname>Soffer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>GN</given-names> </name><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name></person-group><article-title>Retrieval-augmented generation: advancing personalized care and research in oncology</article-title><source>Eur J Cancer</source><year>2025</year><month>05</month><day>2</day><volume>220</volume><fpage>115341</fpage><pub-id pub-id-type="doi">10.1016/j.ejca.2025.115341</pub-id><pub-id pub-id-type="medline">40068371</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ge</surname><given-names>J</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>S</given-names> </name><name name-style="western"><surname>Owens</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Development of a liver disease-specific large language model chat interface using retrieval-augmented generation</article-title><source>Hepatology</source><year>2024</year><month>11</month><day>1</day><volume>80</volume><issue>5</issue><fpage>1158</fpage><lpage>1168</lpage><pub-id pub-id-type="doi">10.1097/HEP.0000000000000834</pub-id><pub-id pub-id-type="medline">38451962</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Gibney</surname><given-names>E</given-names> </name></person-group><article-title>China&#x2019;s cheap, open AI model DeepSeek thrills scientists</article-title><source>Nature</source><year>2025</year><month>02</month><day>6</day><volume>638</volume><issue>8049</issue><fpage>13</fpage><lpage>14</lpage><pub-id pub-id-type="doi">10.1038/d41586-025-00229-6</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gibney</surname><given-names>E</given-names> </name></person-group><article-title>Scientists flock to DeepSeek: how they&#x2019;re using the blockbuster AI model</article-title><source>Nature</source><year>2025</year><month>01</month><day>29</day><pub-id pub-id-type="doi">10.1038/d41586-025-00275-0</pub-id><pub-id pub-id-type="medline">39881178</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Temsah</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alhasan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Altamimi</surname><given-names>I</given-names> </name><etal/></person-group><article-title>DeepSeek in healthcare: revealing opportunities and steering challenges of a new open-source artificial intelligence frontier</article-title><source>Cureus</source><year>2025</year><month>02</month><volume>17</volume><issue>2</issue><fpage>e79221</fpage><pub-id pub-id-type="doi">10.7759/cureus.79221</pub-id><pub-id pub-id-type="medline">39974299</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Conroy</surname><given-names>G</given-names> </name><name 
name-style="western"><surname>Mallapaty</surname><given-names>S</given-names> </name></person-group><article-title>How China created AI model DeepSeek and shocked the world</article-title><source>Nature</source><year>2025</year><month>02</month><day>13</day><volume>638</volume><issue>8050</issue><fpage>300</fpage><lpage>301</lpage><pub-id pub-id-type="doi">10.1038/d41586-025-00259-0</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="web"><source>National rare diseases registry system of China</source><access-date>2024-12-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nrdrs.org.cn/xhrareweb/homeIndex">https://www.nrdrs.org.cn/xhrareweb/homeIndex</ext-link></comment></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="web"><source>DeepL translation: the world&#x2019;s most accurate translation</source><access-date>2024-12-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.deepl.com/translator">https://www.deepl.com/translator</ext-link></comment></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="web"><person-group person-group-type="author"><collab>Ollama</collab></person-group><source>GitHub</source><access-date>2024-12-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/ollama/ollama">https://github.com/ollama/ollama</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="web"><source>swuecho/chat: chat web app for teams, sass with user management and ratelimit, support chatgpt (openai &#x0026; azure), claude, gemini and ollama model</source><access-date>2024-12-09</access-date><publisher-name>GitHub</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/swuecho/chat">https://github.com/swuecho/chat</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation 
citation-type="web"><person-group person-group-type="author"><collab>FIT2CLOUD</collab></person-group><source>MaxKB is a knowledge-base question-answering system based on large language model and RAG</source><access-date>2024-12-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://maxkb.cn/index.html">https://maxkb.cn/index.html</ext-link></comment></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wojtara</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rana</surname><given-names>E</given-names> </name><name name-style="western"><surname>Rahman</surname><given-names>T</given-names> </name><name name-style="western"><surname>Khanna</surname><given-names>P</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>H</given-names> </name></person-group><article-title>Artificial intelligence in rare disease diagnosis and treatment</article-title><source>Clin Transl Sci</source><year>2023</year><month>11</month><volume>16</volume><issue>11</issue><fpage>2106</fpage><lpage>2111</lpage><pub-id pub-id-type="doi">10.1111/cts.13619</pub-id><pub-id pub-id-type="medline">37646577</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Duan</surname><given-names>H</given-names> </name><name name-style="western"><surname>Li</surname><given-names>H</given-names> </name></person-group><article-title>A robust phenotype-driven likelihood ratio analysis approach assisting interpretable clinical diagnosis of rare diseases</article-title><source>J Biomed 
Inform</source><year>2023</year><month>06</month><volume>142</volume><fpage>104372</fpage><pub-id pub-id-type="doi">10.1016/j.jbi.2023.104372</pub-id><pub-id pub-id-type="medline">37105510</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zheng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>X</given-names> </name><name name-style="western"><surname>Feng</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Rare and complex diseases in focus: ChatGPT&#x2019;s role in improving diagnosis and treatment</article-title><source>Front Artif Intell</source><year>2024</year><volume>7</volume><fpage>1338433</fpage><pub-id pub-id-type="doi">10.3389/frai.2024.1338433</pub-id><pub-id pub-id-type="medline">38283995</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name></person-group><article-title>Utility of ChatGPT in clinical practice</article-title><source>J Med Internet Res</source><year>2023</year><month>06</month><day>28</day><volume>25</volume><fpage>e48568</fpage><pub-id pub-id-type="doi">10.2196/48568</pub-id><pub-id pub-id-type="medline">37379067</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirosawa</surname><given-names>T</given-names> </name><name name-style="western"><surname>Mizuta</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Shimizu</surname><given-names>T</given-names> </name></person-group><article-title>Comparative evaluation of diagnostic accuracy between Google Bard and physicians</article-title><source>Am J Med</source><year>2023</year><month>11</month><volume>136</volume><issue>11</issue><fpage>1119</fpage><lpage>1123</lpage><pub-id pub-id-type="doi">10.1016/j.amjmed.2023.08.003</pub-id><pub-id pub-id-type="medline">37643659</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>W</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name></person-group><article-title>PMC-LLaMA: toward building open-source language models for medicine</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1833</fpage><lpage>1843</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae045</pub-id><pub-id pub-id-type="medline">38613821</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ehrett</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hegde</surname><given-names>S</given-names> </name><name name-style="western"><surname>Andre</surname><given-names>K</given-names> </name><name 
name-style="western"><surname>Liu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Wilson</surname><given-names>T</given-names> </name></person-group><article-title>Leveraging open-source large language models for data augmentation in hospital staff surveys: mixed methods study</article-title><source>JMIR Med Educ</source><year>2024</year><month>11</month><day>19</day><volume>10</volume><fpage>e51433</fpage><pub-id pub-id-type="doi">10.2196/51433</pub-id><pub-id pub-id-type="medline">39560937</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Shao</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Uncovering language disparity of ChatGPT on retinal vascular disease classification: cross-sectional study</article-title><source>J Med Internet Res</source><year>2024</year><month>01</month><day>22</day><volume>26</volume><fpage>e51926</fpage><pub-id pub-id-type="doi">10.2196/51926</pub-id><pub-id pub-id-type="medline">38252483</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Alkaoud</surname><given-names>M</given-names> </name></person-group><article-title>A bilingual benchmark for evaluating large language models</article-title><source>PeerJ Comput Sci</source><year>2024</year><volume>10</volume><fpage>e1893</fpage><pub-id pub-id-type="doi">10.7717/peerj-cs.1893</pub-id><pub-id pub-id-type="medline">38435597</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Bhayana</surname><given-names>R</given-names> </name><name name-style="western"><surname>Fawzy</surname><given-names>A</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bleakney</surname><given-names>RR</given-names> </name><name name-style="western"><surname>Krishna</surname><given-names>S</given-names> </name></person-group><article-title>Retrieval-augmented generation for large language models in radiology: another leap forward in board examination performance</article-title><source>Radiology</source><year>2024</year><month>10</month><volume>313</volume><issue>1</issue><fpage>e241489</fpage><pub-id pub-id-type="doi">10.1148/radiol.241489</pub-id><pub-id pub-id-type="medline">39377675</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zakka</surname><given-names>C</given-names> </name><name name-style="western"><surname>Shad</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chaurasia</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Almanac - retrieval-augmented language models for clinical medicine</article-title><source>NEJM AI</source><year>2024</year><month>02</month><volume>1</volume><issue>2</issue><pub-id pub-id-type="doi">10.1056/aioa2300068</pub-id><pub-id pub-id-type="medline">38343631</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kresevic</surname><given-names>S</given-names> </name><name name-style="western"><surname>Giuffr&#x00E8;</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ajcevic</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Accardo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Croc&#x00E8;</surname><given-names>LS</given-names> </name><name name-style="western"><surname>Shung</surname><given-names>DL</given-names> </name></person-group><article-title>Optimization of hepatological clinical guidelines interpretation by large language models: a retrieval augmented generation-based framework</article-title><source>NPJ Digit Med</source><year>2024</year><month>04</month><day>23</day><volume>7</volume><issue>1</issue><fpage>102</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01091-y</pub-id><pub-id pub-id-type="medline">38654102</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Temsah</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Jamal</surname><given-names>A</given-names> </name><name name-style="western"><surname>Alhasan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Temsah</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Malki</surname><given-names>KH</given-names> </name></person-group><article-title>OpenAI o1-preview vs. 
ChatGPT in healthcare: a new frontier in medical AI reasoning</article-title><source>Cureus</source><year>2024</year><month>10</month><volume>16</volume><issue>10</issue><fpage>e70640</fpage><pub-id pub-id-type="doi">10.7759/cureus.70640</pub-id><pub-id pub-id-type="medline">39359332</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Traverso</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dekker</surname><given-names>A</given-names> </name><name name-style="western"><surname>Qian</surname><given-names>L</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>P</given-names> </name></person-group><article-title>Assessing the role of GPT-4 in thyroid ultrasound diagnosis and treatment recommendations: enhancing interpretability with a chain of thought approach</article-title><source>Quant Imaging Med Surg</source><year>2024</year><month>02</month><volume>14</volume><issue>2</issue><fpage>1602</fpage><lpage>1615</lpage><pub-id pub-id-type="doi">10.21037/qims-23-1180</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miao</surname><given-names>J</given-names> </name><name name-style="western"><surname>Thongprayoon</surname><given-names>C</given-names> </name><name name-style="western"><surname>Suppadungsuk</surname><given-names>S</given-names> </name><name name-style="western"><surname>Krisanapan</surname><given-names>P</given-names> </name><name name-style="western"><surname>Radhakrishnan</surname><given-names>Y</given-names> </name><name 
name-style="western"><surname>Cheungpasitporn</surname><given-names>W</given-names> </name></person-group><article-title>Chain of thought utilization in large language models and application in nephrology</article-title><source>Medicina (Kaunas)</source><year>2024</year><month>01</month><day>13</day><volume>60</volume><issue>1</issue><fpage>148</fpage><pub-id pub-id-type="doi">10.3390/medicina60010148</pub-id><pub-id pub-id-type="medline">38256408</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>R</given-names> </name></person-group><article-title>RT: a Retrieving and Chain-of-Thought framework for few-shot medical named entity recognition</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1929</fpage><lpage>1938</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae095</pub-id><pub-id pub-id-type="medline">38708849</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hebenstreit</surname><given-names>K</given-names> </name><name name-style="western"><surname>Praas</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kiesewetter</surname><given-names>LP</given-names> </name><name name-style="western"><surname>Samwald</surname><given-names>M</given-names> </name></person-group><article-title>A comparison of chain-of-thought reasoning strategies across datasets and models</article-title><source>PeerJ Comput 
Sci</source><year>2024</year><volume>10</volume><fpage>e1999</fpage><pub-id pub-id-type="doi">10.7717/peerj-cs.1999</pub-id><pub-id pub-id-type="medline">38855241</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Anderson</surname><given-names>D</given-names> </name><name name-style="western"><surname>Baynam</surname><given-names>G</given-names> </name><name name-style="western"><surname>Blackwell</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Lassmann</surname><given-names>T</given-names> </name></person-group><article-title>Personalised analytics for rare disease diagnostics</article-title><source>Nat Commun</source><year>2019</year><month>11</month><day>21</day><volume>10</volume><issue>1</issue><fpage>5274</fpage><pub-id pub-id-type="doi">10.1038/s41467-019-13345-5</pub-id><pub-id pub-id-type="medline">31754101</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kelly</surname><given-names>C</given-names> </name><name name-style="western"><surname>Szabo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pontikos</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Phenotype-aware prioritisation of rare Mendelian disease variants</article-title><source>Trends Genet</source><year>2022</year><month>12</month><volume>38</volume><issue>12</issue><fpage>1271</fpage><lpage>1283</lpage><pub-id pub-id-type="doi">10.1016/j.tig.2022.07.002</pub-id><pub-id pub-id-type="medline">35934592</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Liu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>W</given-names> </name><etal/></person-group><article-title>Enhancing phenotype recognition in clinical notes using large language models: PhenoBCBERT and PhenoGPT</article-title><source>Patterns (N Y)</source><year>2024</year><month>01</month><day>12</day><volume>5</volume><issue>1</issue><fpage>100887</fpage><pub-id pub-id-type="doi">10.1016/j.patter.2023.100887</pub-id><pub-id pub-id-type="medline">38264716</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>This supplementary material includes all the chat records of the diagnoses generated by ChatGPT-4o and the four open-source LLMs in this study for the cases in the first batch of China's rare disease catalog. The document contains 11 collections of chat content. The diagnostic sequence of the LLMs for the cases can be found in Multimedia Appendix 2.</p><media xlink:href="jmir_v27i1e69929_app1.zip" xlink:title="ZIP File, 2056 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>This supplementary material encompasses the detailed diagnostic outcomes of ChatGPT-4o and the four open-source LLMs for the cases in the first batch of China's rare disease catalog. It includes a total of 12 sheets presenting the specific diagnostic results and a comparative summary.</p><media xlink:href="jmir_v27i1e69929_app2.xlsx" xlink:title="XLSX File, 77 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>This file contains the R code used in this study.</p><media xlink:href="jmir_v27i1e69929_app3.txt" xlink:title="TXT File, 8 KB"/></supplementary-material></app-group></back></article>