<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e73233</article-id><article-id pub-id-type="doi">10.2196/73233</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Enhancing the Accuracy of Human Phenotype Ontology Identification: Comparative Evaluation of Multimodal Large Language Models</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Zhong</surname><given-names>Wei</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Sun</surname><given-names>Mingyue</given-names></name><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yao</surname><given-names>Shun</given-names></name><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>YiFan</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Peng</surname><given-names>Dingchuan</given-names></name><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Yan</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yang</surname><given-names>Kai</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gao</surname><given-names>HuiMin</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yan</surname><given-names>HuiHui</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hao</surname><given-names>WenJing</given-names></name><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Yan</surname><given-names>YouSheng</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Yin</surname><given-names>ChengHong</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Prenatal Diagnosis, Beijing Obstetrics and Gynecology Hospital, Capital Medical University, Beijing Maternal and Child Health Care Hospital</institution><addr-line>251 Yaojiayuan Road, Chaoyang District</addr-line><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff2"><institution>Department of Reproductive Medicine, Shijiazhuang People's Hospital, Hebei 
Province</institution><addr-line>Shijiazhuang</addr-line><country>China</country></aff><aff id="aff3"><institution>Department of Gynecology and Obstetrics, Yijishan Hospital of Wannan Medical College, Anhui province</institution><addr-line>Wuhu</addr-line><country>China</country></aff><aff id="aff4"><institution>School of Medicine, South China University of Technology, Guangdong Province</institution><addr-line>Guangzhou</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Adeoye</surname><given-names>Adekunle</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Agu</surname><given-names>Chiamaka Pamela</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Maheshwari</surname><given-names>Harsh</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to ChengHong Yin, MD, Department of Prenatal Diagnosis, Beijing Obstetrics and Gynecology Hospital, Capital Medical University, Beijing Maternal and Child Health Care Hospital, 251 Yaojiayuan Road, Chaoyang District, Beijing, 100020, China, 86 15572779093; <email>yinchh@ccmu.edu.cn</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>2</day><month>6</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e73233</elocation-id><history><date date-type="received"><day>28</day><month>02</month><year>2025</year></date><date date-type="rev-recd"><day>20</day><month>04</month><year>2025</year></date><date 
date-type="accepted"><day>21</day><month>04</month><year>2025</year></date></history><copyright-statement>&#x00A9; Wei Zhong, Mingyue Sun, Shun Yao, YiFan Liu, Dingchuan Peng, Yan Liu, Kai Yang, HuiMin Gao, HuiHui Yan, WenJing Hao, YouSheng Yan, ChengHong Yin. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 2.6.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e73233"/><abstract><sec><title>Background</title><p>Identifying Human Phenotype Ontology (HPO) terms is crucial for diagnosing and managing rare diseases. However, clinicians, especially junior physicians, often face challenges due to the complexity of describing patient phenotypes accurately. 
Traditional manual search methods using HPO databases are time-consuming and prone to errors.</p></sec><sec><title>Objective</title><p>The aim of the study is to investigate whether the use of multimodal large language models (MLLMs) can improve the accuracy of junior physicians in identifying HPO terms from patient images related to rare diseases.</p></sec><sec sec-type="methods"><title>Methods</title><p>In total, 20 junior physicians from 10 specialties participated. Each physician evaluated 27 patient images sourced from publicly available literature, with phenotypes relevant to rare diseases listed in the Chinese Rare Disease Catalogue. The study was divided into 2 groups: the manual search group relied on the Chinese Human Phenotype Ontology website, while the MLLM-assisted group used an electronic questionnaire that included HPO terms preidentified by ChatGPT-4o as prompts, followed by a search using the Chinese Human Phenotype Ontology. The primary outcome was the accuracy of HPO identification, defined as the proportion of correctly identified HPO terms compared to a standard set determined by an expert panel. Additionally, the accuracy of outputs from ChatGPT-4o and 2 open-source MLLMs (Llama3.2:11b and Llama3.2:90b) was evaluated using the same criteria, with hallucinations for each model documented separately. Furthermore, participating physicians completed an additional electronic questionnaire regarding their rare disease background to identify factors affecting their ability to accurately describe patient images using standardized HPO terms.</p></sec><sec sec-type="results"><title>Results</title><p>A total of 270 descriptions were evaluated per group. The MLLM-assisted group achieved a significantly higher accuracy rate of 67.4% (182/270) compared to 20.4% (55/270) in the manual group (relative risk 3.31, 95% CI 2.58&#x2010;4.25; <italic>P</italic>&#x003C;.001). 
The MLLM-assisted group demonstrated consistent performance across departments, whereas the manual group exhibited greater variability. Among standalone MLLMs, ChatGPT-4o achieved an accuracy of 48% (13/27), while the open-source models Llama3.2:11b and Llama3.2:90b achieved 15% (4/27) and 18% (5/27), respectively. However, MLLMs exhibited a high hallucination rate, frequently generating HPO terms with incorrect IDs or entirely fabricated content. Specifically, ChatGPT-4o, Llama3.2:11b, and Llama3.2:90b generated incorrect IDs in 57.3% (67/117), 98% (62/63), and 82% (46/56) of cases, respectively, and fabricated terms in 34.2% (40/117), 41% (26/63), and 32% (18/56) of cases, respectively. Additionally, a survey on the rare disease knowledge of junior physicians suggests that participation in rare disease and genetic disease training may enhance the performance of some physicians.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The integration of MLLMs into clinical workflows significantly enhances the accuracy of HPO identification by junior physicians, offering promising potential to improve the diagnosis of rare diseases and standardize phenotype descriptions in medical research. However, the notable hallucination rate observed in MLLMs underscores the necessity for further refinement and rigorous validation before widespread adoption in clinical practice.</p></sec></abstract><kwd-group><kwd>multimodal large language models</kwd><kwd>ChatGPT</kwd><kwd>rare diseases</kwd><kwd>human phenotype ontology</kwd><kwd>open-source LLMs</kwd><kwd>large language model</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>The Human Phenotype Ontology (HPO) is a comprehensive and standardized vocabulary designed to describe phenotypic abnormalities associated with over 8100 diseases [<xref ref-type="bibr" rid="ref1">1</xref>]. 
It has become the de facto standard for deep phenotyping in rare diseases and is widely used by researchers, clinicians, informaticians, and electronic health record systems globally [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref4">4</xref>]. The HPO&#x2019;s detailed descriptions and computable disease definitions enhance diagnostic accuracy, especially when integrated with model organism data [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>]. It is also a core component of tools like Face2Gene [<xref ref-type="bibr" rid="ref8">8</xref>] and Exomiser [<xref ref-type="bibr" rid="ref9">9</xref>], which identify disease-causing variants from sequencing data [<xref ref-type="bibr" rid="ref1">1</xref>]. The HPO&#x2019;s interoperability enables integration with other ontologies, advancing genomics and phenomics research [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref10">10</xref>]. As the HPO evolves, its user base grows, and the project team continually expands its content, language translations, mappings, and computational tools to meet increasing demands [<xref ref-type="bibr" rid="ref6">6</xref>].</p><p>Despite the significant utility of the HPO in clinical and research settings, its practical application faces several challenges. First, the HPO includes over 18,000 terms and more than 156,000 annotations for genetic disorders [<xref ref-type="bibr" rid="ref1">1</xref>], organized in a logically structured hierarchy with the most specific terms at the periphery. This complex framework makes it difficult for clinicians and researchers to fully understand and accurately apply all terms, increasing the risk of omissions or misapplications. Second, the presence of semantically similar terms and synonyms within the HPO complicates term identification and matching, further hindering its use. 
Most notably, even with user-friendly web-based interfaces [<xref ref-type="bibr" rid="ref11">11</xref>], describing patients&#x2019; abnormal phenotypes using standardized HPO terms remains a significant challenge for less experienced physicians. This difficulty arises from variations in language use and the inherent complexity of human anatomy [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref6">6</xref>]. These barriers limit the broader adoption of the HPO in electronic health record systems and research publications, potentially restricting its impact on advancing rare disease diagnosis and precision medicine. Inadequate patient phenotyping and inaccuracies in clinical descriptions are key factors contributing to the prolonged diagnostic odyssey faced by many individuals with rare diseases, often requiring years and consultations with multiple specialists to achieve an accurate diagnosis [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref15">15</xref>].</p><p>Large language models (LLMs), exemplified by ChatGPT, have drawn significant attention since their public release in 2022, heralded as catalysts for the fourth industrial revolution [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. These models can respond to free-text queries without task-specific training, sparking both excitement and concern about their potential use in health care [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. Initially designed for text-based tasks, LLMs have shown promising but inconsistent performance across various medical applications [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref26">26</xref>]. 
With technological advancements, models like ChatGPT have improved their ability to generate high-quality responses comparable to those of experienced medical professionals [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref31">31</xref>]. However, their reliance on text-only processing remains a limitation for addressing comprehensive medical scenarios.</p><p>In recent years, multimodal large language models (MLLMs) have emerged as a significant advancement in natural language processing and computer vision, demonstrating substantial potential for integrating medical image and text analysis. For example, studies using ChatGPT-4o, the latest multimodal version of ChatGPT, have shown high diagnostic accuracy with text and image inputs, outperforming medical students on <italic>New England Journal of Medicine</italic> Image Challenge cases [<xref ref-type="bibr" rid="ref32">32</xref>]. However, evaluations of ChatGPT on the Japanese National Medical Licensing Examination highlight ongoing challenges in achieving adequate diagnostic accuracy [<xref ref-type="bibr" rid="ref33">33</xref>]. Overall, research on applying MLLMs to medical image analysis remains limited [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. The potential of MLLMs to integrate images and text opens new possibilities for richer medical applications, including the identification of HPO terms from patient images. We propose that investigating MLLMs in HPO description analysis will not only advance automated term recognition and semantic matching in clinical HPO applications but also significantly contribute to the evolution of precision medicine, improving diagnostic accuracy and therapeutic strategies for rare disease management. 
The innovation of this study lies in advancing the application of LLMs in medicine from pure text to multimodal integration, addressing the critical challenge of accurately describing human phenotypes using HPO terms in clinical practice.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Patient Phenotype Images</title><p>The Chinese Rare Disease Catalogue includes 207 rare diseases [<xref ref-type="bibr" rid="ref34">34</xref>], selected by the Chinese government based on criteria such as incidence rate, disease severity, and diagnostic clarity. Most of these diseases are genetic, making the catalog a valuable resource for studying clinically significant disease phenotypes. Patient images were sourced from the Open-i [<xref ref-type="bibr" rid="ref35">35</xref>], using disease names from the catalog. Inclusion criteria were (1) high image quality, ensuring clarity; (2) a distinct presentation of phenotypic features; (3) the presence of a caption describing the patient&#x2019;s phenotype; and (4) relevance of the depicted phenotype to the searched disease. A total of 27 images meeting these criteria were included, each displaying abnormal physical characteristics associated with the respective diseases. Web links to the images are provided in Sheet 1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Some images were cropped to highlight relevant phenotypic features.</p></sec><sec id="s2-2"><title>Ethical Considerations</title><p>This study used publicly available patient images from the internet for descriptive analysis of features. No processing was performed on the images, and the manuscript does not contain any identifiable patient images. 
As a simulation of clinical diagnostic trials without intervention on real patients, this study complies with the institutional guidelines for ethics committee exemption.</p></sec><sec id="s2-3"><title>Phenotype Recognition Using MLLMs</title><p>To identify phenotypes in the selected patient images, we initially used ChatGPT-4o using the following prompt:</p><disp-quote><p>Now I will provide you with pictures of patients from open access literature, without involving patient privacy. You need to identify the content of the picture and answer which of the most obvious Human Phenotype Ontology (HPO) terms are shown in the patient in this picture. Each picture may contain one or more HPO terms. Only answer the most obvious ones, and do not answer the HPO terms you cannot judge. The answer needs to include the names of the HPO terms, and explain and attach the HPO ID.</p></disp-quote><p>As the research team&#x2019;s native language is Chinese, all interactions with ChatGPT-4o were conducted in Chinese. For each image, the command &#x201C;please describe the most prominent HPO terms in this picture&#x201D; was used. In specific cases, such as an image of a patient with albinism, contextual details (eg, the patient&#x2019;s Asian descent) were provided to ensure accurate phenotype recognition.</p><p>To further evaluate phenotype recognition capabilities, we tested 2 open-source MLLMs developed by Meta&#x2014;Llama3.2:11b and Llama3.2:90b. These models were deployed locally using the Ollama inference framework (version 0.5.10; Ollama) and Cherry Studio (version 1.0.0; Shanghai Qianhui Technology), a user-friendly software for model operation. The recognition process for them followed the same protocol as ChatGPT-4o. 
Complete chat logs, including prompts, responses, and identified HPO terms, are available in Sheet 2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-4"><title>Study Design</title><sec id="s2-4-1"><title>Junior Physicians Recruitment</title><p>To minimize the influence of varying clinical experience, the study recruited 20 junior physicians, including graduate students in clinical medicine and attending physicians, from diverse hospital backgrounds and specialties. Participants were evenly distributed across 10 fields: gynecology (n=2), obstetrics (n=2), radiology (n=2), orthopedics (n=2), obstetrics and gynecology (n=2), surgery (n=2), reproductive medicine (n=2), pediatrics (n=2), internal medicine (n=2), and oncology (n=2). The study design is illustrated in <xref ref-type="fig" rid="figure1">Figures 1</xref> and <xref ref-type="fig" rid="figure2">2</xref>.</p><p>Participants, who were colleagues, classmates, or acquaintances of the researchers, were tasked with providing detailed phenotype descriptions for 27 patient images sourced from public web-based platforms. They were compensated US $14 to ensure engagement and diligent completion of the task.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Overall flowchart of the study design. This figure illustrates the study&#x2019;s methodology for evaluating the impact of MLLMs on the accuracy of HPO identification by junior physicians. Starting with a selection of 27 images from the Open-i database based on the Chinese Rare Disease Catalogue, the images were processed for quality and relevance. In total, 20 junior physicians were randomly divided into 2 groups: one using the Chinese Human Phenotype Ontology website for manual search and the other assisted by preidentified HPO terms from ChatGPT-4o. Additionally, ChatGPT-4o and 2 open-source MLLMs were tested for standalone HPO identification. 
An expert panel of genetic counselors established standard HPO terms for accuracy assessment. The study concluded with a statistical analysis of HPO description accuracy and MLLM hallucination rates based on questionnaire results from both groups. The Open-i service of the National Library of Medicine facilitates the search and retrieval of abstracts and images. HPO: Human Phenotype Ontology; MLLM: multimodal large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e73233_fig01.png"/></fig><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Workflow for patient image recognition using HPO by junior physicians in both cohorts. This figure depicts a sample image of a patient with Hutchinson-Gilford progeria syndrome. Both groups used only the image and their expertise to characterize the patient&#x2019;s phenotype with HPO terms. The MLLM-assisted cohort further used pregenerated ChatGPT-4o prompts, potentially enhancing identification accuracy. Test images encompassed additional anatomical regions, including hands, legs, and skin. HPO: Human Phenotype Ontology; MLLM: multimodal large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e73233_fig02.png"/></fig></sec><sec id="s2-4-2"><title>Group Stratification and Task Assignment</title><p>The 20 physicians were stratified by specialty and randomly assigned to 2 groups. Both groups evaluated the same 27 images through a web-based platform but used different approaches:</p><list list-type="order"><list-item><p>Manual search group: participants received a questionnaire and were instructed to use the Chinese Human Phenotype Ontology (CHPO) website [<xref ref-type="bibr" rid="ref36">36</xref>] to search for the most appropriate HPO terms to describe the phenotypes in the images. They were informed that multiple terms might apply to a single image. 
The CHPO was chosen for its ability to provide standardized HPO terms in Chinese, ensuring consistency across participants.</p></list-item><list-item><p>MLLM-assisted group: participants received a questionnaire where each image was accompanied by HPO terms preidentified by ChatGPT-4o. They were instructed to independently verify and search for appropriate HPO terms using the CHPO.</p></list-item></list></sec><sec id="s2-4-3"><title>Measures to Prevent Bias</title><p>To prevent psychological bias, participants were not informed that 1 questionnaire included HPO terms identified by ChatGPT-4o. The original results of the 540 completed questionnaires can be found in Sheet 3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Additionally, following the completion of the phenotype recognition questionnaire, we distributed a supplementary survey to the 20 clinicians to assess their background knowledge in rare diseases. The survey comprised the following items: years of clinical practice, number of rare diseases encountered annually, attendance at training sessions on rare and genetic diseases, prior awareness of HPO, HPO searches before the survey, and the necessity of describing patient phenotypes in clinical practice. More crucially, the survey also explored the factor they found most challenging in HPO image description after completing the questionnaire.</p></sec></sec><sec id="s2-5"><title>Setting the Standards for HPO</title><p>Accurately determining HPO terms from 2D images is challenging due to factors such as camera angle, lighting, and pixelation, which can obscure phenotypic features. 
Additionally, the hierarchical nature of HPO terms introduces variability, as descriptions often depend on the describer&#x2019;s knowledge and subjective interpretation.</p><p>To address these challenges, an expert panel of 3 senior genetic counselors from a prenatal diagnosis center&#x2014;the chief physician, the deputy chief physician, and the deputy chief technologist of the center&#x2019;s laboratory&#x2014;was convened. All 3 are experienced geneticists with extensive HPO expertise. The panel systematically reviewed each patient image, paired with phenotype descriptions from the original literature, to interpret the patients&#x2019; phenotypes accurately. Through collaborative discussions and consultation with the CHPO, they established a set of correct HPO terms for each image. Despite the inherent subjectivity of HPO descriptions, disagreements were rare and typically resolved by prioritizing consensus. In cases of differing opinions, the majority view (2 of 3) prevailed, though contentious terms were retained for comprehensiveness. If no consensus emerged, the chief physician&#x2019;s decision would take precedence&#x2014;though this was not required in our study. A researcher then used the panel&#x2019;s input to draft preliminary diagnostic criteria.</p><p>Following this, the junior physicians submitted 540 HPO terms across 27 images, which the panel evaluated after completing their questionnaires. Each image was presented in a slide format, displaying the preliminary diagnostic criteria alongside the 20 corresponding questionnaire responses. The panel reviewed these slides, synthesizing all data and resolving discrepancies through further discussion to finalize the standard HPO terms (Sheets 4 and 5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). This rigorous process ensured the accuracy and reliability of the HPO term standards, establishing the final &#x201C;gold standard&#x201D; for the study. 
The approach to handling disagreements remained consistent throughout, aligning with the method used for the preliminary criteria.</p></sec><sec id="s2-6"><title>Study Outcomes</title><p>The primary outcome was the accuracy of HPO descriptions provided by the 2 groups of junior physicians for patient images. A description was considered correct if it included all standard HPO terms for a given image; missing any standard term rendered it incorrect.</p><p>A strict requirement for complete consistency with standard HPO terms was intentionally avoided. This approach acknowledges the complexity of patient images, which may depict a wide range of phenotypic features, some of which may be insignificant or open to interpretation. In clinical practice, physicians&#x2019; observations are often subjective, and capturing the most critical phenotypic features&#x2014;those essential for diagnosis and treatment&#x2014;is sufficient. Thus, descriptions that included the most important HPO terms were deemed accurate. This assessment method was designed to reduce rigidity and better reflect the practical challenges of describing rare disease phenotypes in clinical settings.</p></sec><sec id="s2-7"><title>Statistical Analysis</title><p>Before recruiting junior physicians, 2 researchers independently provided HPO descriptions for all 27 patient images&#x2014;one using manual search methods and the other using MLLM-assisted approaches. Preliminary estimates indicated an accuracy rate of 44% for manual search and 74% for MLLM-assisted. Sample size calculations were performed using R software (version 4.3.2; R Foundation for Statistical Computing) with the <italic>pwr</italic> package. With &#x03B1;=.05 and a power of 0.8, the minimum required sample size per group was approximately 41 (n=40.7). 
While 2 junior physicians per group evaluating the 27 images would meet this requirement, 20 physicians were recruited to include a broader range of specialties and minimize potential biases.</p><p>The Pearson chi-square test was used to compare HPO description accuracy rates between the 2 groups. Statistical significance was set at <italic>P</italic>&#x003C;.05. The McNemar test was used to compare interspecialty variability between the 2 groups, specifically assessing differences in discordant pairs (cases where one group was correct and the other incorrect). The exact conditional McNemar test, implemented via the <italic>exact2x2</italic> package, was applied to ensure precision. The odds ratio, defined as the ratio of group 1-correct/group 2-incorrect cases (b) to group 1-incorrect/group 2-correct cases (c), was calculated to evaluate the relative likelihood of success for MLLM-assisted searches compared to manual searches among discordant pairs. To address zero-cell counts, the Haldane-Anscombe correction (adding 0.5 to both b and c) was used for odds ratio estimation. Since the statistical comparisons between the 2 groups of interspecialty variability were exploratory, no <italic>P</italic> value correction was applied.</p><p>In addition to physician performance, the accuracy of HPO identification by MLLMs was analyzed. Hallucination, such as discrepancies between HPO terms and IDs or the generation of nonexistent terms, was observed. However, the accuracy and hallucination rates of standalone MLLMs were secondary observations and not formally tested.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Primary Outcome</title><p>The performance of the 2 physician groups in identifying correct HPO terms for the 27 patient images is summarized in <xref ref-type="table" rid="table1">Table 1</xref>. 
On average, 6.74 (SD 4.28) physicians in the MLLM-assisted group correctly described each image compared to 2.04 (SD 2.59) in the manual group.</p><p>Of the 270 descriptions collected from each group, the MLLM-assisted group achieved a correctness rate of 67.4% (182/270), while the manual group achieved 20.4% (55/270). Statistical analysis showed that the MLLM-assisted group&#x2019;s accuracy in describing HPO terms was significantly higher than that of the manual group (<italic>&#x03C7;</italic><sup>2</sup>=121.3; <italic>df</italic>=1; relative risk 3.31, 95% CI 2.58-4.25; <italic>P</italic>&#x003C;.001). A complete dataset of raw HPO term descriptions and final judgment outcomes is provided in Sheets 3 and 4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>To quantify the clinical impact of MLLM assistance, we calculated the absolute risk reduction and number needed to treat. The absolute risk reduction was 47%, indicating a 47% increase in accuracy with MLLM support. 
The number needed to treat was approximately 3, meaning that for every 3 HPO terms described, MLLM assistance led to 1 additional correct description.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Correct counts of Human Phenotype Ontology (HPO) descriptions for 27 rare disease patient images by 2 groups of physicians<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">No</td><td align="left" valign="bottom">Rare disease</td><td align="left" valign="bottom">MLLM<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>-assisted (n=10), n (%)</td><td align="left" valign="bottom">Manual search (n=10), n (%)</td></tr></thead><tbody><tr><td align="left" valign="top">1</td><td align="left" valign="top">Albinism</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">2</td><td align="left" valign="top">Alport syndrome</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">6 (60)</td></tr><tr><td align="left" valign="top">3</td><td align="left" valign="top">Amyotrophic lateral sclerosis</td><td align="left" valign="top">5 (50)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">4</td><td align="left" valign="top">Angelman syndrome</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">5</td><td align="left" valign="top">Jeune syndrome</td><td align="left" valign="top">3 (30)</td><td align="left" valign="top">1 (10)</td></tr><tr><td align="left" valign="top">6</td><td align="left" valign="top">Congenital scoliosis</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">9 (90)</td></tr><tr><td align="left" valign="top">7</td><td align="left" valign="top">Fabry disease</td><td align="left" valign="top">1 (10)</td><td 
align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">8</td><td align="left" valign="top">Gaucher disease</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">9</td><td align="left" valign="top">Generalized myasthenia gravis</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">4 (40)</td></tr><tr><td align="left" valign="top">10</td><td align="left" valign="top">Hereditary angioedema</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">5 (50)</td></tr><tr><td align="left" valign="top">11</td><td align="left" valign="top">Marfan syndrome</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">4 (40)</td></tr><tr><td align="left" valign="top">12</td><td align="left" valign="top">McCune-Albright syndrome</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">6 (60)</td></tr><tr><td align="left" valign="top">13</td><td align="left" valign="top">Noonan syndrome</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">3 (30)</td></tr><tr><td align="left" valign="top">14</td><td align="left" valign="top">Peutz-Jeghers syndrome</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">2 (20)</td></tr><tr><td align="left" valign="top">15</td><td align="left" valign="top">POEMS<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup> syndrome</td><td align="left" valign="top">2 (20)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">16</td><td align="left" valign="top">Achondroplasia</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">17</td><td align="left" valign="top">Acromegaly</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">18</td><td align="left" valign="top">Adult-onset 
Still disease</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">1 (10)</td></tr><tr><td align="left" valign="top">19</td><td align="left" valign="top">Alagille syndrome</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">20</td><td align="left" valign="top">Bardet-Biedl syndrome</td><td align="left" valign="top">4 (40)</td><td align="left" valign="top">6 (60)</td></tr><tr><td align="left" valign="top">21</td><td align="left" valign="top">Blue rubber bleb nevus</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">1 (10)</td></tr><tr><td align="left" valign="top">22</td><td align="left" valign="top">Cutaneous T-cell lymphomas</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">23</td><td align="left" valign="top">Fibrodysplasia ossificans progressiva</td><td align="left" valign="top">7 (70)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">24</td><td align="left" valign="top">Generalized pustular psoriasis</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">25</td><td align="left" valign="top">Hidradenitis suppurativa</td><td align="left" valign="top">0 (0)</td><td align="left" valign="top">0 (0)</td></tr><tr><td align="left" valign="top">26</td><td align="left" valign="top">Hutchinson-Gilford progeria syndrome</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">4 (40)</td></tr><tr><td align="left" valign="top">27</td><td align="left" valign="top">Lennox-Gastaut syndrome</td><td align="left" valign="top">10 (100)</td><td align="left" valign="top">3 (30)</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>The data in the table represent the number of physicians who correctly used HPO descriptions for the corresponding 
images.</p></fn><fn id="table1fn2"><p><sup>b</sup>MLLM: multimodal large language model.</p></fn><fn id="table1fn3"><p><sup>c</sup>POEMS: polyneuropathy, organomegaly, endocrinopathy, M-protein, skin changes.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-2"><title>Departmental Variability in HPO Description Accuracy</title><p>The participating physicians represented 10 departments (<xref ref-type="fig" rid="figure3">Figure 3</xref>). In the manual group, HPO description accuracy varied significantly across departments. Physicians from reproductive medicine, gynecology, and obstetrics achieved the highest accuracy rates, while those from orthopedics, internal medicine, and pediatrics had the lowest. In contrast, the MLLM-assisted group showed more consistent accuracy across all departments, with correct counts consistently higher than those in the manual group.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Accuracy of Human Phenotype Ontology (HPO) descriptions for patient images by junior physicians from different departments. Each department is represented by a single physician. In the manual search group, significant variability in the accuracy of HPO descriptions for patient images was observed among physicians from different departments. In contrast, in the MLLM-assisted group, all physicians achieved higher description accuracy compared to the manual search group, with more consistent performance levels across the group. MLLM: multimodal large language model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e73233_fig03.png"/></fig><p><xref ref-type="table" rid="table2">Table 2</xref> presents a comparison of interspecialty variability between the 2 groups, providing precise data to support <xref ref-type="fig" rid="figure3">Figure 3</xref>. 
Across all specialties, the MLLM-assisted group consistently demonstrated a higher description accuracy rate than the manual group, with statistically significant differences (<italic>P</italic>&#x003C;.05). This finding reinforces our primary conclusion. However, due to the limited number of participating physicians per specialty, this analysis remains exploratory, and results should be interpreted with caution.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Comparison of the interspecialty variability between the 2 groups.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Specialty</td><td align="left" valign="bottom">MLLM<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>-assisted (n=27), n (%)</td><td align="left" valign="bottom">Manual search (n=27), n (%)</td><td align="left" valign="bottom">OR<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> (95% CI)<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">Reproductive medicine</td><td align="left" valign="top">18 (67)</td><td align="left" valign="top">11 (41)</td><td align="left" valign="top">5.7 (1.0-32.2)</td><td align="left" valign="top">.04</td></tr><tr><td align="left" valign="top">Gynecology</td><td align="left" valign="top">17 (63)</td><td align="left" valign="top">10 (37)</td><td align="left" valign="top">5.7 (1.0-32.2)</td><td align="left" valign="top">.04</td></tr><tr><td align="left" valign="top">Obstetrics</td><td align="left" valign="top">19 (70)</td><td align="left" valign="top">10 (37)</td><td align="left" valign="top">19.0 (1.1-326.5)</td><td align="left" valign="top">.04</td></tr><tr><td align="left" valign="top">Surgery</td><td align="left" valign="top">21 (78)</td><td align="left" valign="top">6 (22)</td><td align="left" valign="top">31.0 (1.9-518.1)</td><td 
align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Radiology</td><td align="left" valign="top">18 (67)</td><td align="left" valign="top">4 (15)</td><td align="left" valign="top">29.0 (1.7-486.2)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Oncology</td><td align="left" valign="top">17 (63)</td><td align="left" valign="top">4 (15)</td><td align="left" valign="top">9.7 (1.8-51.9)</td><td align="left" valign="top">.001</td></tr><tr><td align="left" valign="top">Obstetrics and gynecology</td><td align="left" valign="top">20 (74)</td><td align="left" valign="top">4 (15)</td><td align="left" valign="top">33.0 (2.0-550.1)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Orthopedics</td><td align="left" valign="top">17 (63)</td><td align="left" valign="top">2 (7)</td><td align="left" valign="top">31.0 (1.9-518.1)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Internal medicine</td><td align="left" valign="top">18 (67)</td><td align="left" valign="top">2 (7)</td><td align="left" valign="top">33.0 (2.0-550.1)</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top">Pediatrics</td><td align="left" valign="top">17 (63)</td><td align="left" valign="top">2 (7)</td><td align="left" valign="top">11.0 (2.1-58.5)</td><td align="left" valign="top">&#x003C;.001</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>MLLM: multimodal large language model</p></fn><fn id="table2fn2"><p><sup>b</sup>OR: odds ratio.</p></fn><fn id="table2fn3"><p><sup>c</sup>Adjusted ORs and their 95% CI were reported to ensure stability in small-sample settings.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-3"><title>MLLMs Alone</title><p>The same methodology was used to evaluate the accuracy of HPO identification by standalone MLLMs without physician guidance. 
Among the 27 patient images, ChatGPT-4o achieved an accuracy rate of 48% (13/27), while Llama3.2:11b and Llama3.2:90b achieved rates of 15% (4/27) and 18% (5/27), respectively (<xref ref-type="table" rid="table3">Table 3</xref>).</p><p>Given the issue of hallucination in current LLMs, we reviewed the authenticity of the HPO terms generated (Sheet 6 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). ChatGPT-4o produced 117 HPO terms, each with an HPO ID. Verification revealed that 57.3% (67/117) had incorrect IDs, and 34.2% (40/117) were fabricated, as they could not be retrieved on the HPO website.</p><p>Similarly, Llama3.2:11b generated 63 HPO terms, with only 1 term having a correct ID. Thus, 98% (62/63) had mismatched IDs, and 41% (26/63) were fabricated. Llama3.2:90b, despite its larger parameter size, produced 56 HPO terms, of which 82% (46/56) had incorrect IDs, and 32% (18/56) were fabricated.</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Accuracy of Human Phenotype Ontology (HPO) identification and frequency of hallucinations in patient images by multimodal large language models.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Category</td><td align="left" valign="bottom">ChatGPT-4o, n/N (%)</td><td align="left" valign="bottom">Llama3.2:11b, n/N (%)</td><td align="left" valign="bottom">Llama3.2:90b, n/N (%)</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="4">Identification</td></tr><tr><td align="left" valign="top">&#x2003;Correct</td><td align="left" valign="top">13/27 (48)</td><td align="left" valign="top">4/27 (15)</td><td align="left" valign="top">5/27 (18)</td></tr><tr><td align="left" valign="top">&#x2003;Incorrect</td><td align="left" valign="top">14/27 (52)</td><td align="left" valign="top">23/27 (85)</td><td align="left" valign="top">22/27 (82)</td></tr><tr><td align="left" valign="top" 
colspan="4">Hallucinations<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup></td></tr><tr><td align="left" valign="top">&#x2003;Incorrect IDs</td><td align="left" valign="top">67/117 (57.3)</td><td align="left" valign="top">62/63 (98)</td><td align="left" valign="top">46/56 (82)</td></tr><tr><td align="left" valign="top">&#x2003;Fabricated terms</td><td align="left" valign="top">40/117 (34.2)</td><td align="left" valign="top">26/63 (41)</td><td align="left" valign="top">18/56 (32)</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>Hallucinations refer to model-generated HPO terms that either mismatch their IDs or are nonexistent.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-4"><title>Background Information on Rare Diseases for Junior Physicians</title><p>Following the patient description questionnaire, we recontacted the 20 junior physicians and conducted a survey on their rare disease background. The survey assessed years of clinical practice, annual encounters with patients with rare disease, and training in rare and genetic diseases, among other factors (Sheet 7 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). A baseline comparison of the junior physicians revealed a relatively balanced distribution between the 2 groups. The manual search group reported an average of 5.0 (SD 1.94) years of clinical practice compared to 5.3 (SD 2.11) years in the MLLM-assisted group. Physicians in the MLLM-assisted group encountered more patients with rare disease annually than those in the manual search group (mean 8.4, SD 5.72 vs mean 5.4, SD 3.06). The distribution of training and exposure to rare diseases, genetic diseases, and the HPO was also relatively balanced between groups. 
Due to the small sample size, no statistical tests were conducted.</p><p>To investigate why certain specialties outperformed others in the manual group, we ranked the physicians by accuracy rate and compared the top half (4 physicians, as the fifth physician&#x2019;s accuracy rate of 14.8% matched the next 3) with the bottom half (6 physicians). The analysis suggested that attendance at rare disease training (n=2, 50% vs n=1, 17%) and genetic disease training (n=4, 100% vs n=4, 67%) may contribute to higher accuracy, as these factors were more common among the top performers.</p><p>Finally, we surveyed the 20 physicians on the challenges they faced when describing patient phenotypes. Only 1 (5%) physician cited unfamiliarity with anatomy as the primary difficulty, 6 (30%) reported difficulty finding suitable HPO terms, and the majority, 13 (65%), identified both issues as barriers to accurate HPO retrieval.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This study evaluated the accuracy of MLLMs in identifying terms from the HPO. The findings revealed that the MLLM-assisted group achieved significantly higher accuracy in describing HPO terms for patient images associated with rare diseases compared with the manual search group (182/270, 67.4% vs 55/270, 20.4%). Notably, there was substantial variability in the accuracy of descriptions among physicians from different departments in the manual group. In contrast, the MLLM-assisted group demonstrated consistently high performance regardless of departmental differences. When the performance of the MLLMs was evaluated independently, ChatGPT-4o achieved a description accuracy rate of 48% (13/27), outperforming the open-source models Llama3.2:11b and Llama3.2:90b, which had accuracy rates of 15% (4/27) and 18% (5/27), respectively. 
These results indicate that the highest level of accuracy in identifying HPO terms can be achieved through collaboration between junior physicians and MLLMs, combining human expertise with the computational capabilities of these models. Despite their promise, a notable limitation of current MLLMs is their high rate of hallucination in generated HPO terms. Many terms produced by the models either did not match official HPO IDs or were entirely fabricated.</p></sec><sec id="s4-2"><title>Limitations</title><p>This study has several limitations. First, the small sample size&#x2014;20 physicians and 27 patient images&#x2014;may limit the generalizability of the findings to broader clinical contexts. Second, the evaluation did not include several other MLLMs [<xref ref-type="bibr" rid="ref37">37</xref>-<xref ref-type="bibr" rid="ref39">39</xref>], which could have provided a more comprehensive performance comparison. Additionally, relying solely on publicly available patient images may not fully replicate real-world clinical phenotype description processes. There is also a possibility that some images were part of the models&#x2019; training datasets, potentially introducing bias. However, the study&#x2019;s primary aim was not to diagnose conditions but to assess MLLMs&#x2019; ability to identify phenotypes using HPO terms. Thus, the evaluation of their performance on unseen images remains reasonably valid. Finally, our study did not include senior physicians, especially those adept in HPO searches and possessing deep anatomical expertise, who would likely achieve greater baseline accuracy in manual identification. Consequently, the difference in performance between manual and MLLM-assisted approaches may be less pronounced for these experts. 
Nonetheless, as clinicians with extensive rare disease experience are rare, our study targeted junior physicians to represent the wider clinical landscape and highlight MLLM&#x2019;s potential to boost diagnostic precision.</p></sec><sec id="s4-3"><title>Comparison With Prior Work</title><p>Previous studies have shown that LLMs like ChatGPT excel in recognizing phenotype concepts in natural language [<xref ref-type="bibr" rid="ref10">10</xref>] and, with fine-tuning, outperform traditional tools in identifying HPO IDs [<xref ref-type="bibr" rid="ref40">40</xref>]. However, research on MLLMs for HPO recognition remains limited, and their broader medical potential is underexplored [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref33">33</xref>]. While MLLMs have demonstrated success in tasks like pathological image classification [<xref ref-type="bibr" rid="ref41">41</xref>] and chest radiograph diagnosis [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref42">42</xref>-<xref ref-type="bibr" rid="ref44">44</xref>], their application to rare diseases is hindered by the lack of high-quality, disease-specific datasets [<xref ref-type="bibr" rid="ref45">45</xref>,<xref ref-type="bibr" rid="ref46">46</xref>]. Unlike earlier methods requiring extensive annotated medical images, ChatGPT-4o benefits from pretraining on diverse public datasets, making it particularly suitable for rare disease tasks. The survey on the rare disease knowledge of junior physicians indicates that attendance at rare disease and genetic disease training may contribute to improved performance among some physicians. This is plausible, as even in specialties with frequent exposure to patients with rare disease, physicians typically rely on nonstandardized natural language for descriptions and are confined to rare diseases within their own field. 
Training in rare and genetic diseases appears to enhance junior physicians&#x2019; ability to accurately describe patient phenotypes. However, even the highest-performing physicians in the manual search group achieved an accuracy rate of only 40.7%, which was surpassed by all physicians using MLLM assistance. The extensive anatomical knowledge embedded in MLLMs, combined with the provision of standardized HPO terms, provides valuable support for junior physicians. This assistance also mitigates the primary challenges most physicians reported in describing phenotypes.</p><p>Our study reveals that collaboration between junior physicians and ChatGPT-4o significantly enhances HPO description accuracy for rare diseases, outperforming both traditional methods and standalone MLLM use. This aligns with prior research showing the benefits of combining clinician expertise with visual-language models [<xref ref-type="bibr" rid="ref47">47</xref>]. Phenotype-driven diagnostic strategies for rare diseases are widely acknowledged as effective [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]; yet, clinicians often face challenges in accurately describing phenotypes using standardized HPO terms during initial consultations. The complexity of human anatomical variations [<xref ref-type="bibr" rid="ref48">48</xref>] and the extensive catalog of HPO terms pose significant barriers to precise documentation and communication. Our findings demonstrate that MLLM-assisted workflows streamline this process, improving accuracy and completeness over unaided methods. Additionally, MLLMs like ChatGPT-4o excel in processing contextual information, offering a clear advantage over traditional artificial intelligence tools [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. 
For instance, in this study, when ChatGPT-4o was provided with the key contextual detail that a patient with albinism is a person of color, it correctly identified &#x201C;hypopigmentation of the skin&#x201D; as a significant HPO. Without this context, the model might have incorrectly assumed the patient to be White, potentially overlooking this crucial skin phenotype.</p><p>Current MLLMs face several challenges in phenotype recognition tasks. First, they often generate irrelevant HPO terms, which can mislead physicians. For example, when analyzing an image of a patient with Bardet-Biedl syndrome, the model failed to identify the standard HPO term &#x201C;polydactyly.&#x201D; This led to a higher error rate in the MLLM-assisted group (6/10 physicians) compared to the manual group (4/10). This anchoring effect, caused by the model&#x2019;s output, is unavoidable and can mislead some participants, diminishing their ability in certain specific diagnoses. While real-world clinical settings allow patients to clarify ambiguous findings, this example underscores the risk of models inadvertently misleading clinicians. Additionally, the accuracy of HPO identification remains suboptimal. ChatGPT-4o, the top-performing model, achieved only 48% (13/27) accuracy, with open-source models performing even worse. This highlights the early developmental stage of MLLMs in HPO identification and the significant performance gaps among models from different vendors [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref49">49</xref>,<xref ref-type="bibr" rid="ref50">50</xref>]. Moreover, as observed in prior research, the phenomenon of hallucination persists in MLLMs [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. These models frequently fabricate HPO terms or generate IDs that do not correspond to actual entries, further limiting their applicability in clinical workflows. 
Although open-source models are often preferred for patient privacy protection, their performance and hallucination rates in this study were concerning, likely due to limited pretraining on medical images and HPO-related materials [<xref ref-type="bibr" rid="ref33">33</xref>]. Even a large open-source model with 90 billion parameters lagged significantly behind ChatGPT-4o in phenotype identification. However, open-source models hold promise for future applications. With ongoing technological advancements and task-specific fine-tuning, their performance and reliability can be improved.</p><p>In this study, we contend that the risks posed by hallucinations in MLLMs for HPO identification are less severe than those associated with LLMs for direct diagnosis. We classified hallucinations into 2 categories: mismatched HPO IDs and fabricated, nonexistent HPO terms. When MLLMs are used solely to aid in describing patient features within medical records, these hallucinations are unlikely to exert a substantial adverse effect. In the context of integrating generated HPO terms with phenotype-driven diagnostic tools, an accurate HPO name ensures that mismatched IDs do not compromise the diagnostic process. Fabricated terms, being unrecognizable by such tools, may simply prompt researchers to identify correct terms or exclude the erroneous ones. Given that phenotype-driven tools provide reference-based diagnostic outcomes, with confirmatory diagnoses typically relying on genetic testing, this approach demonstrates greater clinical acceptability than direct LLM-based diagnosis.</p><p>While MLLMs can augment junior physicians&#x2019; ability to describe patient images, their outputs require careful application in clinical settings. Clinicians must leverage their expertise to distinguish reliable content from inaccuracies. Furthermore, when MLLM outputs contribute to medical record documentation, oversight by seasoned physicians is essential. 
Finally, for academic discourse or use in phenotype-driven diagnostic tools, generated HPO terms should be validated against the HPO database to ensure accuracy.</p></sec><sec id="s4-4"><title>Conclusions</title><p>The integration of MLLMs into clinical workflows demonstrates potential in enhancing junior physicians&#x2019; ability to describe disease phenotypes using standardized HPO terms. This collaborative approach surpasses standalone MLLMs, underscoring the added value of physician involvement. While open-source MLLMs show promise in phenotype identification, even advanced models like ChatGPT-4o face challenges such as identification errors and hallucinated outputs. Future efforts should focus on fine-tuning open-source MLLMs with expanded datasets of diverse phenotype images and HPO-related corpora. This strategy could improve model accuracy, reliability, and patient privacy, ultimately facilitating more precise use of HPO in clinical practice and medical research.</p></sec></sec></body><back><ack><p>The authors are grateful for the support of the following funds: National Key Research and Development Program of China (2023YFC2705600), Capital Clinical Characteristic Diagnosis and Treatment Technology Research and Transformation Application Project (Z221100007422012), Beijing Hospital Management Center &#x201C;Yangfan&#x201D; Plan 3.0 Clinical Technology Innovation Project (ZLRK202329), and Science and Technology Innovation and Transformation Special Project of Beijing Obstetrics and Gynecology Hospital Affiliated to Capital Medical University/Beijing Maternal and Child Health Hospital (FCYYZH202201).</p></ack><notes><sec><title>Data Availability</title><p>All data generated or analyzed during this study are included in this published paper (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p></sec></notes><fn-group><fn fn-type="con"><p>WZ wrote the manuscript and created the figures. 
WZ, MS, SY, DP, and HG collected images of patients with rare diseases; WZ, MS, SY, DP, HG, HY, and WH gathered information generated by multimodal large language models; YY, YiFan Liu, Yan Liu, KY, and WZ participated in establishing the gold standard Human Phenotype Ontology (HPO) terms for the study; WZ, MS, and SY were responsible for distributing questionnaires to junior physicians and compiling the responses. WZ collected and analyzed the data for <xref ref-type="table" rid="table1">Table 1</xref>; WZ and MS collected and analyzed the data for <xref ref-type="table" rid="table2">Table 2</xref>; WZ, MS, and SY collected and analyzed the data for <xref ref-type="table" rid="table3">Table 3</xref>. WZ and DP were in charge of the statistical methods for the study. YY and CY were responsible for the study design, conceptualization, supervision, and funding acquisition. All authors reviewed the final manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">CHPO</term><def><p>Chinese Human Phenotype Ontology</p></def></def-item><def-item><term id="abb2">HPO</term><def><p>Human Phenotype Ontology</p></def></def-item><def-item><term id="abb3">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb4">MLLM</term><def><p>multimodal large language model</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gargano</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Matentzoglu</surname><given-names>N</given-names> </name><name name-style="western"><surname>Coleman</surname><given-names>B</given-names> </name><etal/></person-group><article-title>The Human Phenotype Ontology in 2024: phenotypes around the world</article-title><source>Nucleic Acids 
Res</source><year>2024</year><month>01</month><day>5</day><volume>52</volume><issue>D1</issue><fpage>D1333</fpage><lpage>D1346</lpage><pub-id pub-id-type="doi">10.1093/nar/gkad1005</pub-id><pub-id pub-id-type="medline">37953324</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>K&#x00F6;hler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Carmody</surname><given-names>L</given-names> </name><name name-style="western"><surname>Vasilevsky</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Expansion of the Human Phenotype Ontology (HPO) knowledge base and resources</article-title><source>Nucleic Acids Res</source><year>2019</year><month>01</month><day>8</day><volume>47</volume><issue>D1</issue><fpage>D1018</fpage><lpage>D1027</lpage><pub-id pub-id-type="doi">10.1093/nar/gky1105</pub-id><pub-id pub-id-type="medline">30476213</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>K&#x00F6;hler</surname><given-names>S</given-names> </name><name name-style="western"><surname>Vasilevsky</surname><given-names>NA</given-names> </name><name name-style="western"><surname>Engelstad</surname><given-names>M</given-names> </name><etal/></person-group><article-title>The Human Phenotype Ontology in 2017</article-title><source>Nucleic Acids Res</source><year>2017</year><month>01</month><day>4</day><volume>45</volume><issue>D1</issue><fpage>D865</fpage><lpage>D876</lpage><pub-id pub-id-type="doi">10.1093/nar/gkw1039</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pontikos</surname><given-names>N</given-names> </name><name 
name-style="western"><surname>Murphy</surname><given-names>C</given-names> </name><name name-style="western"><surname>Moghul</surname><given-names>I</given-names> </name><etal/></person-group><article-title>Phenogenon: gene to phenotype associations for rare genetic diseases</article-title><source>PLoS ONE</source><year>2020</year><volume>15</volume><issue>4</issue><fpage>e0230587</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0230587</pub-id><pub-id pub-id-type="medline">32271766</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhai</surname><given-names>W</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>N</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>S</given-names> </name></person-group><article-title>Phen2Disease: a phenotype-driven model for disease and gene prioritization by bidirectional maximum matching semantic similarities</article-title><source>Brief Bioinform</source><year>2023</year><month>07</month><day>20</day><volume>24</volume><issue>4</issue><fpage>bbad172</fpage><pub-id pub-id-type="doi">10.1093/bib/bbad172</pub-id><pub-id pub-id-type="medline">37248747</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dhombres</surname><given-names>F</given-names> </name><name name-style="western"><surname>Morgan</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chaudhari</surname><given-names>BP</given-names> </name><etal/></person-group><article-title>Prenatal phenotyping: a community effort to enhance the Human Phenotype Ontology</article-title><source>American J Med Genetics Pt 
C</source><year>2022</year><month>06</month><volume>190</volume><issue>2</issue><fpage>231</fpage><lpage>242</lpage><pub-id pub-id-type="doi">10.1002/ajmg.c.31989</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Robinson</surname><given-names>PN</given-names> </name><name name-style="western"><surname>Ravanmehr</surname><given-names>V</given-names> </name><name name-style="western"><surname>Jacobsen</surname><given-names>JOB</given-names> </name><etal/></person-group><article-title>Interpretable clinical genomics with a likelihood ratio paradigm</article-title><source>Am J Hum Genet</source><year>2020</year><month>09</month><day>3</day><volume>107</volume><issue>3</issue><fpage>403</fpage><lpage>417</lpage><pub-id pub-id-type="doi">10.1016/j.ajhg.2020.06.021</pub-id><pub-id pub-id-type="medline">32755546</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="web"><article-title>Face2Gene is a suite of phenotyping applications that facilitate comprehensive and precise genetic evaluations</article-title><source>Face2Gene</source><access-date>2025-01-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.face2gene.com/">https://www.face2gene.com/</ext-link></comment></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="web"><article-title>Exomiser: a tool to annotate and prioritize exome variants</article-title><source>GitHub</source><access-date>2025-01-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/exomiser/Exomiser">https://github.com/exomiser/Exomiser</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Groza</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Caufield</surname><given-names>H</given-names> </name><name name-style="western"><surname>Gration</surname><given-names>D</given-names> </name><etal/></person-group><article-title>An evaluation of GPT models for phenotype concept recognition</article-title><source>BMC Med Inform Decis Mak</source><year>2024</year><month>01</month><day>31</day><volume>24</volume><issue>1</issue><fpage>30</fpage><pub-id pub-id-type="doi">10.1186/s12911-024-02439-w</pub-id><pub-id pub-id-type="medline">38297371</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="web"><article-title>Human Phenotype Ontology</article-title><source>Monarch Initiative</source><access-date>2025-01-10</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://hpo.jax.org/">https://hpo.jax.org/</ext-link></comment></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>McNeill</surname><given-names>A</given-names> </name></person-group><article-title>Good genotype-phenotype relationships in rare disease are hard to find</article-title><source>Eur J Hum Genet</source><year>2022</year><month>03</month><volume>30</volume><issue>3</issue><fpage>251</fpage><pub-id pub-id-type="doi">10.1038/s41431-022-01062-5</pub-id><pub-id pub-id-type="medline">35260823</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jacobsen</surname><given-names>JOB</given-names> </name><name name-style="western"><surname>Kelly</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cipriani</surname><given-names>V</given-names> </name><etal/></person-group><article-title>Phenotype-driven approaches to enhance variant prioritization and diagnosis of rare disease</article-title><source>Hum 
Mutat</source><year>2022</year><month>08</month><volume>43</volume><issue>8</issue><fpage>1071</fpage><lpage>1081</lpage><pub-id pub-id-type="doi">10.1002/humu.24380</pub-id><pub-id pub-id-type="medline">35391505</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kelly</surname><given-names>C</given-names> </name><name name-style="western"><surname>Szabo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pontikos</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Phenotype-aware prioritisation of rare Mendelian disease variants</article-title><source>Trends Genet</source><year>2022</year><month>12</month><volume>38</volume><issue>12</issue><fpage>1271</fpage><lpage>1283</lpage><pub-id pub-id-type="doi">10.1016/j.tig.2022.07.002</pub-id><pub-id pub-id-type="medline">35934592</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lagorce</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lebreton</surname><given-names>E</given-names> </name><name name-style="western"><surname>Matalonga</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Phenotypic similarity-based approach for variant prioritization for unsolved rare disease: a preliminary methodological report</article-title><source>Eur J Hum Genet</source><year>2024</year><month>02</month><volume>32</volume><issue>2</issue><fpage>182</fpage><lpage>189</lpage><pub-id pub-id-type="doi">10.1038/s41431-023-01486-7</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>ChatGPT</article-title><source>OpenAI</source><access-date>2025-01-10</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://chatgpt.com">https://chatgpt.com</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Xie</surname><given-names>W</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name></person-group><article-title>Knowledge-enhanced visual-language pre-training on chest radiology images</article-title><source>Nat Commun</source><year>2023</year><month>07</month><day>28</day><volume>14</volume><issue>1</issue><pub-id pub-id-type="doi">10.1038/s41467-023-40260-7</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moor</surname><given-names>M</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>O</given-names> </name><name name-style="western"><surname>Abad</surname><given-names>ZSH</given-names> </name><etal/></person-group><article-title>Foundation models for generalist medical artificial intelligence</article-title><source>Nature</source><year>2023</year><month>04</month><volume>616</volume><issue>7956</issue><fpage>259</fpage><lpage>265</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-05881-4</pub-id><pub-id pub-id-type="medline">37045921</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Assessing the utility of ChatGPT throughout the entire clinical workflow: development and usability study</article-title><source>J Med Internet Res</source><year>2023</year><volume>25</volume><fpage>e48659</fpage><pub-id pub-id-type="doi">10.2196/48659</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pushpanathan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Lim</surname><given-names>ZW</given-names> </name><name name-style="western"><surname>Er Yew</surname><given-names>SM</given-names> </name><etal/></person-group><article-title>Popular large language model chatbots&#x2019; accuracy, comprehensiveness, and self-awareness in answering ocular symptom 
queries</article-title><source>iScience</source><year>2023</year><month>11</month><day>17</day><volume>26</volume><issue>11</issue><fpage>108163</fpage><pub-id pub-id-type="doi">10.1016/j.isci.2023.108163</pub-id><pub-id pub-id-type="medline">37915603</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>C</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>S</given-names> </name></person-group><article-title>Utility of ChatGPT in clinical practice</article-title><source>J Med Internet Res</source><year>2023</year><volume>25</volume><fpage>e48568</fpage><pub-id pub-id-type="doi">10.2196/48568</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Laios</surname><given-names>A</given-names> </name><name name-style="western"><surname>Theophilou</surname><given-names>G</given-names> </name><name name-style="western"><surname>De Jong</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kalampokis</surname><given-names>E</given-names> </name></person-group><article-title>The future of AI in ovarian cancer research: the large language models perspective</article-title><source>Cancer Control</source><year>2023</year><volume>30</volume><fpage>10732748231197915</fpage><pub-id pub-id-type="doi">10.1177/10732748231197915</pub-id><pub-id pub-id-type="medline">37624621</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hirosawa</surname><given-names>T</given-names> </name><name 
name-style="western"><surname>Mizuta</surname><given-names>K</given-names> </name><name name-style="western"><surname>Harada</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Shimizu</surname><given-names>T</given-names> </name></person-group><article-title>Comparative evaluation of diagnostic accuracy between Google Bard and physicians</article-title><source>Am J Med</source><year>2023</year><month>11</month><volume>136</volume><issue>11</issue><fpage>1119</fpage><lpage>1123</lpage><pub-id pub-id-type="doi">10.1016/j.amjmed.2023.08.003</pub-id><pub-id pub-id-type="medline">37643659</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hermann</surname><given-names>CE</given-names> </name><name name-style="western"><surname>Patel</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Boyd</surname><given-names>L</given-names> </name><name name-style="western"><surname>Growdon</surname><given-names>WB</given-names> </name><name name-style="western"><surname>Aviki</surname><given-names>E</given-names> </name><name name-style="western"><surname>Stasenko</surname><given-names>M</given-names> </name></person-group><article-title>Let&#x2019;s chat about cervical cancer: assessing the accuracy of ChatGPT responses to cervical cancer questions</article-title><source>Gynecol Oncol</source><year>2023</year><month>12</month><volume>179</volume><fpage>164</fpage><lpage>168</lpage><pub-id pub-id-type="doi">10.1016/j.ygyno.2023.11.008</pub-id><pub-id pub-id-type="medline">37988948</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chervenak</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lieman</surname><given-names>H</given-names> 
</name><name name-style="western"><surname>Blanco-Breindel</surname><given-names>M</given-names> </name><name name-style="western"><surname>Jindal</surname><given-names>S</given-names> </name></person-group><article-title>The promise and peril of using a large language model to obtain clinical information: ChatGPT performs strongly as a fertility counseling tool with limitations</article-title><source>Fertil Steril</source><year>2023</year><month>09</month><volume>120</volume><issue>3</issue><fpage>575</fpage><lpage>583</lpage><pub-id pub-id-type="doi">10.1016/j.fertnstert.2023.05.151</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> </name><name name-style="western"><surname>Gottweis</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Toward expert-level medical question answering with large language models</article-title><source>Nat Med</source><year>2025</year><month>03</month><volume>31</volume><issue>3</issue><fpage>943</fpage><lpage>950</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03423-7</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>M</given-names> </name><name name-style="western"><surname>Qian</surname><given-names>J</given-names> </name><name name-style="western"><surname>Pan</surname><given-names>S</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Qiu</surname><given-names>RLJ</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>X</given-names> 
</name></person-group><article-title>Advancing medical imaging with language models: featuring a spotlight on ChatGPT</article-title><source>Phys Med Biol</source><year>2024</year><month>05</month><day>3</day><volume>69</volume><issue>10</issue><fpage>10TR01</fpage><pub-id pub-id-type="doi">10.1088/1361-6560/ad387d</pub-id><pub-id pub-id-type="medline">38537293</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sandmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Riepenhausen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Plagwitz</surname><given-names>L</given-names> </name><name name-style="western"><surname>Varghese</surname><given-names>J</given-names> </name></person-group><article-title>Systematic analysis of ChatGPT, Google search and Llama 2 for clinical decision support tasks</article-title><source>Nat Commun</source><year>2024</year><month>03</month><day>6</day><volume>15</volume><issue>1</issue><fpage>2050</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-46411-8</pub-id><pub-id pub-id-type="medline">38448475</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goh</surname><given-names>E</given-names> </name><name name-style="western"><surname>Gallo</surname><given-names>R</given-names> </name><name name-style="western"><surname>Hom</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Large language model influence on diagnostic reasoning: a randomized clinical trial</article-title><source>JAMA Netw Open</source><year>2024</year><month>10</month><day>1</day><volume>7</volume><issue>10</issue><fpage>e2440969</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2024.40969</pub-id><pub-id 
pub-id-type="medline">39466245</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Qiu</surname><given-names>P</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><etal/></person-group><article-title>Towards building multilingual language model for medicine</article-title><source>Nat Commun</source><year>2024</year><month>09</month><day>27</day><volume>15</volume><issue>1</issue><fpage>8384</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-52417-z</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Suh</surname><given-names>PS</given-names> </name><name name-style="western"><surname>Shim</surname><given-names>WH</given-names> </name><name name-style="western"><surname>Suh</surname><given-names>CH</given-names> </name><etal/></person-group><article-title>Comparing large language model and human reader accuracy with New England Journal of Medicine Image Challenge case image inputs</article-title><source>Radiology</source><year>2024</year><month>12</month><volume>313</volume><issue>3</issue><fpage>e241668</fpage><pub-id pub-id-type="doi">10.1148/radiol.241668</pub-id><pub-id pub-id-type="medline">39656125</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nakao</surname><given-names>T</given-names> </name><name name-style="western"><surname>Miki</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nakamura</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Capability of GPT-4V(ision) in the 
Japanese National Medical Licensing Examination: evaluation study</article-title><source>JMIR Med Educ</source><year>2024</year><month>03</month><day>12</day><volume>10</volume><fpage>e54393</fpage><pub-id pub-id-type="doi">10.2196/54393</pub-id><pub-id pub-id-type="medline">38470459</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><source>National Rare Diseases Registry System of China</source><access-date>2024-12-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nrdrs.org.cn/xhrareweb/homeIndex">https://www.nrdrs.org.cn/xhrareweb/homeIndex</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="web"><article-title>Open Access Biomedical Image Search Engine</article-title><source>National Library of Medicine</source><access-date>2025-01-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openi.nlm.nih.gov/">https://openi.nlm.nih.gov/</ext-link></comment></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="web"><source>CHPO</source><access-date>2025-01-14</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.chinahpo.net/chpo/#/search">https://www.chinahpo.net/chpo/#/search</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="web"><article-title>Kimi.ai</article-title><source>Moonshot</source><access-date>2025-01-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://kimi.moonshot.cn/">https://kimi.moonshot.cn/</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name></person-group><article-title>haotian-liu/LLaVA</article-title><source>GitHub</source><year>2025</year><access-date>2025-01-15</access-date><comment><ext-link 
ext-link-type="uri" xlink:href="https://github.com/haotian-liu/LLaVA">https://github.com/haotian-liu/LLaVA</ext-link></comment></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="web"><article-title>Claude</article-title><source>Anthropic</source><access-date>2025-01-15</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://claude.ai">https://claude.ai</ext-link></comment></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>A</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>C</given-names> </name></person-group><article-title>Fine-tuning large language models for rare disease concept normalization</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>2076</fpage><lpage>2083</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae133</pub-id><pub-id pub-id-type="medline">38829731</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Huang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Bianchi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Yuksekgonul</surname><given-names>M</given-names> </name><name name-style="western"><surname>Montine</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Zou</surname><given-names>J</given-names> </name></person-group><article-title>A visual-language foundation model for pathology image analysis using medical Twitter</article-title><source>Nat 
Med</source><year>2023</year><month>09</month><volume>29</volume><issue>9</issue><fpage>2307</fpage><lpage>2316</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02504-3</pub-id><pub-id pub-id-type="medline">37592105</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Gu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Kobayashi</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Interpretable medical image Visual Question Answering via multi-modal relationship graph learning</article-title><source>Med Image Anal</source><year>2024</year><month>10</month><volume>97</volume><fpage>103279</fpage><pub-id pub-id-type="doi">10.1016/j.media.2024.103279</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roos</surname><given-names>J</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>R</given-names> </name><name name-style="western"><surname>Kaczmarczyk</surname><given-names>R</given-names> </name></person-group><article-title>Evaluating Bard Gemini Pro and GPT-4 Vision against student performance in medical Visual Question Answering: comparative case study</article-title><source>JMIR Form Res</source><year>2024</year><month>12</month><day>17</day><volume>8</volume><fpage>e57592</fpage><pub-id pub-id-type="doi">10.2196/57592</pub-id><pub-id pub-id-type="medline">39714199</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mahjoubi</surname><given-names>M</given-names> </name><name 
name-style="western"><surname>Shahabi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sheikhbahaei</surname><given-names>S</given-names> </name><name name-style="western"><surname>Jazi</surname><given-names>AHD</given-names> </name></person-group><article-title>Evaluating AI capabilities in bariatric surgery: a study on ChatGPT-4 and DALL&#x00B7;E 3&#x2019;s recognition and illustration accuracy</article-title><source>Obes Surg</source><year>2025</year><month>02</month><volume>35</volume><issue>2</issue><fpage>638</fpage><lpage>641</lpage><pub-id pub-id-type="doi">10.1007/s11695-024-07653-z</pub-id><pub-id pub-id-type="medline">39733375</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lapidus</surname><given-names>D</given-names> </name></person-group><article-title>Strengths and limitations of new artificial intelligence tool for rare disease epidemiology</article-title><source>J Transl Med</source><year>2023</year><month>04</month><day>30</day><volume>21</volume><issue>1</issue><fpage>292</fpage><pub-id pub-id-type="doi">10.1186/s12967-023-04152-0</pub-id><pub-id pub-id-type="medline">37122037</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Banerjee</surname><given-names>J</given-names> </name><name name-style="western"><surname>Taroni</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Allaway</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Prasad</surname><given-names>DV</given-names> </name><name name-style="western"><surname>Guinney</surname><given-names>J</given-names> </name><name name-style="western"><surname>Greene</surname><given-names>C</given-names> </name></person-group><article-title>Machine 
learning in rare disease</article-title><source>Nat Methods</source><year>2023</year><month>06</month><volume>20</volume><issue>6</issue><fpage>803</fpage><lpage>814</lpage><pub-id pub-id-type="doi">10.1038/s41592-023-01886-z</pub-id><pub-id pub-id-type="medline">37248386</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tanno</surname><given-names>R</given-names> </name><name name-style="western"><surname>Barrett</surname><given-names>DGT</given-names> </name><name name-style="western"><surname>Sellergren</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Collaboration between clinicians and vision-language models in radiology report generation</article-title><source>Nat Med</source><year>2025</year><month>02</month><volume>31</volume><issue>2</issue><fpage>599</fpage><lpage>608</lpage><pub-id pub-id-type="doi">10.1038/s41591-024-03302-1</pub-id><pub-id pub-id-type="medline">39511432</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nzenwa</surname><given-names>IC</given-names> </name><name name-style="western"><surname>Iqbal</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Bazira</surname><given-names>PJ</given-names> </name></person-group><article-title>Exploring the inclusion of anatomical variation in medical education</article-title><source>Anat Sci Educ</source><year>2023</year><volume>16</volume><issue>3</issue><fpage>531</fpage><lpage>546</lpage><pub-id pub-id-type="doi">10.1002/ase.2254</pub-id><pub-id pub-id-type="medline">36637969</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Koga</surname><given-names>S</given-names> </name><name name-style="western"><surname>Martin</surname><given-names>NB</given-names> </name><name name-style="western"><surname>Dickson</surname><given-names>DW</given-names> </name></person-group><article-title>Evaluating the performance of large language models: ChatGPT and Google Bard in generating differential diagnoses in clinicopathological conferences of neurodegenerative disorders</article-title><source>Brain Pathol</source><year>2024</year><month>05</month><volume>34</volume><issue>3</issue><fpage>e13207</fpage><pub-id pub-id-type="doi">10.1111/bpa.13207</pub-id><pub-id pub-id-type="medline">37553205</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhong</surname><given-names>W</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Diagnostic performance of ChatGPT-4o and four open-source large language models on China&#x2019;s rare disease catalog: comparative study</article-title><source>JMIR Preprints</source><comment>Preprint posted online on Dec 11, 2025</comment><pub-id pub-id-type="doi">10.2196/preprints.69929</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>The supplementary material contains 7 sheets, providing detailed information about the study. The specific content of each sheet is as follows: Sheet 1. Information on Open-Access images used for phenotype descriptions; Sheet 2. Chat logs output by multimodal large language models; Sheet 3. HPO terms filled in the questionnaires by the 2 Groups of Junior Physicians; Sheet 4. Accuracy judgments of HPO terms in the questionnaires; Sheet 5. 
Standard HPO answers established by the expert panel; Sheet 6. Hallucination of multimodal large language models; Sheet 7. Background information on rare diseases for junior physicians.</p><media xlink:href="jmir_v27i1e73233_app1.xlsx" xlink:title="XLSX File, 86 KB"/></supplementary-material></app-group></back></article>