<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e91399</article-id><article-id pub-id-type="doi">10.2196/91399</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Adaptive Fast-Slow Large Language Model Framework for Multidimensional Classification of Prenatal Ultrasound Reports: Comparative Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Zhong</surname><given-names>Wei</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yan</surname><given-names>Huihui</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Yifan</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Yan</given-names></name><degrees>MD</degrees><xref 
ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yang</surname><given-names>Kai</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Gao</surname><given-names>Huimin</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Yao</surname><given-names>Zhengyang</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Hao</surname><given-names>Wenjing</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Yan</surname><given-names>Yousheng</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" corresp="yes" equal-contrib="yes"><name name-style="western"><surname>Yin</surname><given-names>Chenghong</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib></contrib-group><aff id="aff1"><institution>Department of Medical Genetics, Beijing Obstetrics and Gynecology Hospital, Capital Medical University. Beijing Maternal and Child Health Care Hospital</institution><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff2"><institution>Department of Prenatal Diagnosis Center, Beijing Obstetrics and Gynecology Hospital, Capital Medical University. 
Beijing Maternal and Child Health Care Hospital</institution><addr-line>Beijing</addr-line><country>China</country></aff><aff id="aff3"><institution>Department of Central Laboratory, Beijing Obstetrics and Gynecology Hospital, Capital Medical University. Beijing Maternal and Child Health Care Hospital</institution><addr-line>No. 251 Yaojiayuan Road, Chaoyang District</addr-line><addr-line>Beijing</addr-line><country>China</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Balcarras</surname><given-names>Matthew</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Lin</surname><given-names>Ling</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Li</surname><given-names>Zhi</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Chenghong Yin, MD, Department of Central Laboratory, Beijing Obstetrics and Gynecology Hospital, Capital Medical University. Beijing Maternal and Child Health Care Hospital, No. 251 Yaojiayuan Road, Chaoyang District, Beijing, 100026, China, 86 15572779093; <email>yinchh@ccmu.edu.cn</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>28</day><month>5</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e91399</elocation-id><history><date date-type="received"><day>14</day><month>01</month><year>2026</year></date><date date-type="rev-recd"><day>02</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>04</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9;Wei Zhong, Huihui Yan, Yifan Liu, Yan Liu, Kai Yang, Huimin Gao, Zhengyang Yao, Wenjing Hao, Yousheng Yan, Chenghong Yin. 
Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 28.5.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e91399"/><abstract><sec><title>Background</title><p>Phenotype-driven prenatal diagnosis relies on the precise correlation between ultrasound findings and genetic outcomes; however, this process is hindered by the unstructured nature of clinical ultrasound reports. 
While large language models (LLMs) hold the potential to address this challenge, their specific application in this domain remains systematically underexplored.</p></sec><sec><title>Objective</title><p>To establish an effective LLM implementation framework for the clinical multidimensional classification of prenatal ultrasound reports, we evaluated the open-source DeepSeek-V3.2 family on real-world anomalous reports&#x2014;covering both factual and subjective categories&#x2014;while integrating retrieval-augmented generation (RAG) and chain-of-thought (CoT) reasoning.</p></sec><sec sec-type="methods"><title>Methods</title><p>From a cohort of 4256 pregnancies, we extracted 254 reports with fetal anomalies. We comprehensively evaluated both the high-speed base model (DeepSeek-V3.2-B) and the reasoning-enhanced model (DeepSeek-V3.2-R) across all 5 classification dimensions, comprising 4 factual extraction tasks&#x2014;primary classification, standardized terminology, anatomical system, and abnormality count&#x2014;and 1 subjective severity assessment. We further explicitly evaluated the efficacy of RAG for the subjective tasks. Finally, to validate the clinical utility of this approach, we performed a correlation analysis between the expert-validated multidimensional phenotypic profiles and definitive genetic outcomes derived from amniocentesis.</p></sec><sec sec-type="results"><title>Results</title><p>While V3.2-B achieved high efficiency in factual tasks (accuracy and <italic>F</italic><sub>1</sub>-score &#x003E;90%), it underperformed in subjective severity grading (56.6% accuracy), exhibiting a recall of 0 for minor anomalies. Crucially, while RAG significantly improved both models&#x2019; performance on internal retrieval datasets (<italic>P</italic>&#x003C;.05), this benefit did not generalize to external test datasets (<italic>P</italic>&#x003E;.05). 
In contrast, the V3.2-R model utilizing CoT reasoning achieved superior robustness (86% accuracy and <italic>F</italic><sub>1</sub>-score=0.75) on external data without RAG; notably, introducing RAG to V3.2-R degraded performance to 81%, suggesting potential noise interference. Clinical validation against amniocentesis outcomes confirmed that accurate multidimensional phenotypic profiles significantly stratified pathogenic genetic risks.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>The rapid base models are efficient for factual classification, and RAG enhances performance on data similar to the knowledge base, whereas CoT is indispensable for subjective assessment. Within the constraints of our dataset and current retrieval implementation, CoT proved more robust than RAG for subjective assessment. However, this finding is specifically tied to our experimental setup and should not be generalized as a universal conclusion. We recommend clinically adopting this adaptive &#x201C;fast-slow&#x201D; LLM framework to efficiently perform the multidimensional classification of prenatal ultrasound anomalies. This privacy-preserving, locally deployable solution provides a scalable path to accelerate phenotype-genotype research and optimize invasive diagnostic decision-making.</p></sec></abstract><kwd-group><kwd>large language models</kwd><kwd>prenatal ultrasound</kwd><kwd>DeepSeek</kwd><kwd>phenotype-driven diagnosis</kwd><kwd>chain-of-thought</kwd><kwd>retrieval-augmented generation</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Prenatal ultrasound is fundamental for fetal assessment, but efficiently transforming these narrative reports into structured data for clinical decision-support remains a significant challenge [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. 
While amniocentesis provides definitive genetic diagnoses, its associated risks of miscarriage and infection [<xref ref-type="bibr" rid="ref3">3</xref>] make it unsuitable for universal application to all cases with abnormal ultrasound findings. Crucially, the decision to pursue invasive testing relies on a nuanced, multidimensional risk assessment rather than a singular finding. Different classification dimensions&#x2014;specifically the affected anatomical system, the count of anomalies (isolated vs multiple), and the severity grading&#x2014;each carry distinct predictive weights regarding chromosomal outcomes. For instance, multisystem defects or lethal malformations suggest a significantly higher genetic risk compared to isolated soft markers.</p><p>Therefore, a comprehensive integration of these multidimensional classifications is essential to provide patients with data-driven counseling, enabling them to weigh the probability of a genetic disorder against the procedural risks of amniocentesis. However, the large-scale analysis required to validate and refine this multidimensional risk stratification is hindered by the nature of clinical documentation: the frequent occurrence of benign anomalies and inconsistent descriptive terminology [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref7">7</xref>] creates unstructured &#x201C;data silos.&#x201D; Establishing a standardized framework to correlate these specific sonographic phenotypes with genetic outcomes is vital; yet, it has been hindered by the labor-intensive, expert-dependent process of annotating large-scale datasets.</p><p>The recent emergence of high-performance, open-source large language models (LLMs) offers a potential solution to this clinical natural language processing bottleneck [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref11">11</xref>]. 
However, the deployment of LLMs in medicine faces the critical challenge of hallucinations [<xref ref-type="bibr" rid="ref12">12</xref>]. To mitigate this, retrieval-augmented generation (RAG) has become the prevailing paradigm, anchoring model outputs to external knowledge bases to ensure factual accuracy [<xref ref-type="bibr" rid="ref13">13</xref>]. However, RAG primarily enhances information retrieval rather than logical reasoning. For complex, subjective tasks&#x2014;such as assessing the severity of fetal anomalies based on subtle descriptive nuances&#x2014;access to external knowledge may be insufficient without the capacity for deep reasoning. This has catalyzed the development of chain-of-thought (CoT) reasoning models [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref14">14</xref>], which mimic the deliberate &#x201C;System 2&#x201D; thinking popularized by Daniel Kahneman [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>] by decomposing complex problems into intermediate logical steps. Unlike proprietary models, modern open-source LLMs, such as DeepSeek-V3.2, can be securely deployed within hospital environments. The V3.2 iteration uniquely offers both a high-speed base model (V3.2-B) and a reasoning-enhanced variant (V3.2-R), presenting a new opportunity to address the varying complexity of medical tasks&#x2014;ranging from routine information extraction to complex logic-based severity assessment&#x2014;within a unified local framework.</p><p>Despite these technological advances, the comparative effectiveness of retrieval-based versus reasoning-based approaches in automating the analysis of prenatal ultrasound reports remains largely unexplored. This study proposes an adaptive &#x201C;fast-slow&#x201D; LLM framework to address the multidimensional complexity of fetal phenotype extraction. 
We utilized the DeepSeek-V3.2 suite to evaluate the trade-offs between the fast base model (V3.2-B) and the slow reasoning model (V3.2-R), specifically assessing the utility of RAG versus CoT in subjective severity grading. To establish the clinical validity of this multidimensional classification, we further analyzed the association between expert-verified results and &#x201C;gold standard&#x201D; genetic outcomes from amniocentesis. Our objective was to demonstrate that accurate, multidimensional profiling is strongly predictive of pathogenic risks, and that while the fast base LLM and RAG suffice for factual tasks, CoT reasoning is indispensable for automating the subjective components of this profile, thereby accelerating phenotype-driven diagnosis.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Patient Recruitment and Data Collection</title><p>Between January 2023 and July 2024, 4256 pregnant women underwent an enhanced noninvasive prenatal test (NIPT2.0) as part of a longitudinal clinical efficacy validation study. The inclusion criteria were singleton pregnancies between 12 and 20 weeks of gestation, while excluding those with conditions known to significantly interfere with the NIPT2.0 analysis. This specific gestational window was deliberately selected because it aligns with the optimal and most actionable clinical timeframe for evaluating the necessity of invasive amniocentesis. Although this specific window inherently excludes late-onset anomalies presenting in the third trimester, it precisely captures the most critical period for phenotype-driven genetic risk assessment. Furthermore, because the NIPT2.0 screening&#x2014;which covers both common aneuploidies and select monogenic disorders [<xref ref-type="bibr" rid="ref17">17</xref>]&#x2014;was offered free of charge, participation rates were exceptionally high. 
This provided a broad and highly representative real-world sample of early-to-mid second-trimester ultrasound findings, laying a solid data foundation for the downstream phenotype-genetic correlation analysis. Maternal age, family history, and prenatal ultrasound reports were collected at enrollment. Participants with negative (low-risk) NIPT2.0 results received routine follow-up, while those with positive (high-risk) results were advised to undergo amniocentesis for definitive diagnosis. The manual review of all records identified 254 participants with ultrasound reports showing anomalies of the fetus, placenta, umbilical cord, or amniotic fluid. These 254 reports constituted the dataset for this analysis. The study workflow is depicted in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of the study design. The workflow comprises three stages: (1) cohort construction: the selection of 254 abnormal fetal ultrasound reports from 4256 screenings. (2) The adaptive &#x201C;fast-slow&#x201D; framework: a dual-pathway system deploying DeepSeek-V3.2-B (fast) for objective factual extraction and DeepSeek-V3.2-R (slow) for subjective severity assessment. This phase explicitly compares the efficacy of RAG versus CoT reasoning. (3) Clinical validation: the dual assessment of the structured output against expert consensus (technical accuracy) and amniocentesis genetic outcomes (clinical utility). 
CoT: chain-of-thought; LLMs: large language models; NIPT2.0: enhanced noninvasive prenatal test; RAG: retrieval-augmented generation; V3.2-B: DeepSeek-V3.2 base model; V3.2-R: DeepSeek-V3.2 reasoning-enhanced model.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e91399_fig01.png"/></fig></sec><sec id="s2-2"><title>Multidimensional Classification of Prenatal Ultrasound Reports by LLMs</title><p>The 254 selected abnormal prenatal ultrasound reports were organized into a spreadsheet format. We utilized Dify (LangGenius, Inc) [<xref ref-type="bibr" rid="ref18">18</xref>] to call the application programming interface of DeepSeek-V3.2, provided by the SiliconCloud service [<xref ref-type="bibr" rid="ref19">19</xref>], to perform the automated classification. Each report was analyzed individually in a single-turn conversation with model parameters set to temperature=0 and top-p=0.95.</p><p>Five classification schemes were developed to structure the raw report data, including 4 fact-based classifications and 1 subjective assessment. For each scheme, prompts were designed by a sonographer and a prenatal diagnostician to guide the LLM. Due to their extensive length, the full verbatim prompts are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. All prompts adhered to a consistent design framework; they explicitly delineated the clinical rules and definitions for each classification scheme (as outlined below) and strategically incorporated specific explanations alongside clinical examples to disambiguate potentially confusing or overlapping findings. For classification purposes, suspected anomalies were treated as confirmed. 
The 5 schemes were as follows:</p><list list-type="order"><list-item><p>Primary classification: categorized each finding into 1 of 4 mutually exclusive groups: increased nuchal translucency (NT), other soft markers, structural abnormalities, and fetal growth restriction (FGR).</p></list-item><list-item><p>Standardized terminology: converted descriptive text into standardized clinical terms while omitting laterality (eg, standardizing &#x201C;left ventriculomegaly&#x201D; to &#x201C;ventriculomegaly&#x201D;).</p></list-item><list-item><p>Anatomical system: mapped anomalies to the affected fetal system (eg, nervous and cardiovascular). Nonstructural or nonfetal findings were classified as &#x201C;None.&#x201D;</p></list-item><list-item><p>Abnormality count: classified reports as &#x201C;Solitary&#x201D; (a single anomaly) or &#x201C;Multiple&#x201D; (&#x2265;2 distinct anomalies, including bilateral presentations of a single finding).</p></list-item><list-item><p>Severity (subjective assessment): graded reports according to the most severe anomaly present, using 3 levels defined by standardized clinical criteria: lethal (eg, anencephaly, typically requiring termination of pregnancy), major (eg, complex congenital heart disease, significantly affecting viability), or minor (eg, cleft lip, surgically correctable postnatally). Nonstructural findings, such as NT or FGR, were categorized as &#x201C;Other.&#x201D;</p></list-item></list><p>For all schemes except &#x201C;abnormality count&#x201D; and &#x201C;severity,&#x201D; the model could generate multiple comma-separated outputs per report.</p></sec><sec id="s2-3"><title>LLM Execution and Evaluation</title><p>Two DeepSeek models were utilized sequentially. 
While both models belong to the DeepSeek-V3.2 family and share a highly efficient underlying architecture (incorporating mechanisms like DeepSeek Sparse Attention for rapid processing), their posttraining paradigms and inference mechanisms differ fundamentally [<xref ref-type="bibr" rid="ref20">20</xref>]. V3.2-B is optimized using standard supervised fine-tuning for rapid instruction following and direct pattern matching, making it highly efficient for objective factual extraction. In contrast, V3.2-R is a reasoning-enhanced variant heavily trained with a scalable reinforcement learning protocol to intrinsically generate intermediate CoT steps. During inference, V3.2-R allocates substantial additional computational resources to process an internal &#x201C;thinking&#x201D; phase before producing the final response. This additional deliberation step mimics human &#x201C;System 2&#x201D; deliberate deduction, enabling V3.2-R to handle subjective clinical nuances, albeit at the cost of significantly longer processing times.</p><p>First, V3.2-B rapidly processed all reports for initial classification (2&#x2010;4 s/report). To ensure the reliability of the classification, 2 experienced attending prenatal diagnosticians independently evaluated the V3.2-B outputs in a double-blind manner, labeling each classification as &#x201C;Correct&#x201D; or &#x201C;Incorrect.&#x201D; Disagreements were resolved through consensus meetings guided by a senior chief diagnostician (YY) to form a preliminary reference standard. Because all discrepancies were ultimately resolved by consensus, formal interrater reliability metrics were not calculated.</p><p>Subsequently, the senior chief diagnostician utilized this preliminary standard to evaluate V3.2-R outputs. 
This secondary expert review step was essential because V3.2-R occasionally presented valid but differently phrased classifications that required expert judgment rather than simple string-matching; this review established the final &#x201C;gold standard&#x201D; dataset.</p><p>Both models were independently evaluated on the entire dataset across all 5 classification schemes. For this study, an <italic>F</italic><sub>1</sub>-score greater than 0.90 was prospectively defined as indicating highly reliable and clinically acceptable performance for the classification of prenatal ultrasound anomalies.</p></sec><sec id="s2-4"><title>Construction of Knowledge Base and Implementation of RAG</title><p>To evaluate the efficacy of RAG in enhancing model performance, the dataset of expert-verified severity assessments (n=254) was systematically partitioned based on sequential identification numbers. The first half (n=127) constituted the retrieval set, which was vectorized to construct the external knowledge base. The subsequent half (n=127) served as the unseen test set. Although the 2 halves produced by this sequential split share local linguistic patterns and clinical workflows, the split was intentionally designed to simulate a real-world scenario in which a hospital utilizes its own historical records as a RAG knowledge base to process new, incoming reports of a similar style.</p><p>The RAG pipeline was deployed using the Dify platform. We integrated the Qwen3-Reranker-8B model for semantic reranking of candidate chunks. The retrieval parameters were configured with a Top-K of 3 and a similarity score threshold of 0.60 to filter out low-relevance noise. 
Finally, both V3.2-B and V3.2-R were tested on both the retrieval set and the test set to assess two critical metrics: (1) the effectiveness of RAG in retrieving and utilizing &#x201C;seen&#x201D; knowledge (performance on the retrieval set) and (2) the generalizability of the RAG-enhanced models when applied to &#x201C;unseen&#x201D; clinical data (performance on the test set).</p></sec><sec id="s2-5"><title>Follow-Up and Prenatal Diagnostic Outcomes</title><p>Among the 254 women with abnormal ultrasound findings, those with high-risk NIPT2.0 results were counseled for diagnostic amniocentesis. Definitive genetic testing included one or more of the following: karyotyping, chromosomal microarray analysis, whole-exome sequencing, and copy number variation sequencing. Cases were categorized based on these results. Patients who declined diagnostic testing after a high-risk NIPT2.0 result were excluded from the association analysis.</p><p>For the purpose of this study, a negative prenatal diagnostic outcome was assigned to all participants with a low-risk NIPT2.0 result. This approach was justified by the high negative predictive value of NIPT2.0 and supported by two observations in our cohort: (1) all women in this group who nonetheless underwent amniocentesis for sonographic indications had negative results and (2) clinicians did not recommend invasive testing for the remainder, judging the genetic risk to be low.</p></sec><sec id="s2-6"><title>Association Analysis Between Classified Ultrasound Abnormalities and Genetic Outcomes</title><p>An association analysis was performed on 251 eligible cases, correlating the classified ultrasound abnormalities with genetic diagnostic outcomes. This analysis utilized a &#x201C;gold standard&#x201D; dataset, which consisted of clinician-verified LLM classifications supplemented with manual corrections. 
The results were visualized using bar charts, showing the number and proportion of positive and negative genetic diagnoses for each anomaly category.</p><p>Prior to analysis, manual data curation was performed to standardize the LLM outputs for 3 classification schemes (standardized terminology, primary classification, and anatomical system). This step involved consolidating semantically identical but textually variant terms (eg, merging &#x201C;Increased NT&#x201D; and &#x201C;Increased nuchal translucency&#x201D;) and grouping identical combinations of findings listed in different orders to ensure accurate frequency counts for the association analysis.</p></sec><sec id="s2-7"><title>Statistical Analysis</title><p>To evaluate the performance of DeepSeek-V3.2, we calculated the overall accuracy for each classification scheme. A true positive was recorded when an item in the model&#x2019;s output list also appeared in the gold standard; a false positive was an output item absent from the gold standard; and a false negative was a gold-standard item missing from the output list. Precision was defined as the number of true positives divided by the sum of true positives and false positives, and recall as the number of true positives divided by the sum of true positives and false negatives. 
The <italic>F</italic><sub>1</sub>-score was calculated as follows:</p><disp-formula id="equWL1"><mml:math id="eqn1"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:msub><mml:mi mathvariant="italic">F</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mn>2</mml:mn><mml:mo>&#x00D7;</mml:mo><mml:mfrac><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>&#x00D7;</mml:mo><mml:mtext>Recall</mml:mtext></mml:mrow><mml:mrow><mml:mtext>Precision</mml:mtext><mml:mo>+</mml:mo><mml:mtext>Recall</mml:mtext></mml:mrow></mml:mfrac></mml:mrow></mml:mstyle></mml:math></disp-formula><p>For the multiclass &#x201C;Severity&#x201D; classification, macroaveraged precision, recall, and <italic>F</italic><sub>1</sub>-scores were calculated to assess balanced performance across all categories:</p><disp-formula id="equWL2"><mml:math id="eqn2"><mml:mstyle displaystyle="true" scriptlevel="0"><mml:mrow><mml:mtext>Macro-</mml:mtext><mml:msub><mml:mi mathvariant="italic">F</mml:mi><mml:mn>1</mml:mn></mml:msub><mml:mo>=</mml:mo><mml:mfrac><mml:mn>1</mml:mn><mml:mi>N</mml:mi></mml:mfrac><mml:munderover><mml:mo movablelimits="false">&#x2211;</mml:mo><mml:mrow><mml:mi>i</mml:mi><mml:mo>=</mml:mo><mml:mn>1</mml:mn></mml:mrow><mml:mi>N</mml:mi></mml:munderover><mml:msub><mml:mi mathvariant="italic">F</mml:mi><mml:mrow><mml:mn>1</mml:mn><mml:mo>,</mml:mo><mml:mi>i</mml:mi></mml:mrow></mml:msub></mml:mrow></mml:mstyle></mml:math></disp-formula><p>where <italic>N</italic> is the number of classes and <italic>F</italic><sub>1,<italic>i</italic></sub> is the <italic>F</italic><sub>1</sub>-score of class <italic>i</italic>. 
Additionally, we calculated the precision, recall, and <italic>F</italic><sub>1</sub>-score for each of the 4 severity categories individually.</p><p>For the association analysis, Pearson&#x2019;s <italic>&#x03C7;</italic><sup>2</sup> test was used to compare the rate of positive genetic diagnoses between the solitary and multiple abnormality categories, with significance set at <italic>P</italic>&#x003C;.05. Due to the limited number of positive cases within the numerous categories of the other classifications, these were analyzed using descriptive statistics. The results were visualized as the number and proportion of cases with positive versus negative genetic diagnoses for each abnormality type.</p><p>The difference in accuracy between the pre- and post-RAG phases was analyzed using the McNemar test for paired categorical data. Continuity correction was applied where appropriate. Statistical significance was defined as a 2-sided <italic>P</italic>&#x003C;.05. Data analysis was conducted using Python 3.10.</p></sec><sec id="s2-8"><title>Ethical Considerations</title><p>This comparative effectiveness study utilized the existing data from a prospectively registered cohort. The cohort protocol was registered with the Medical Research Registration and Filing Information System of the National Health Security Information Platform of China (registration number MR-11-24-002508). The study was conducted in accordance with the Declaration of Helsinki and approved by the institutional review board (2023-KY-099&#x2010;02). All participants in the original cohort provided written informed consent, agreeing to the use of their deidentified clinical information for scientific research purposes. 
As this study constitutes a secondary analysis of existing, deidentified data for comparative effectiveness purposes without causal inference, no additional patient contact or consent was required.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Screening and Identification of Cases With Positive Amniocentesis Results</title><p>Of the 254 women with abnormal ultrasound findings, 30 had high-risk NIPT2.0 results. Three of these women, all with a high risk for Trisomy 21, declined amniocentesis and were therefore excluded from the association analysis. The remaining 27 women underwent amniocentesis, all of whom were confirmed to have a positive genetic diagnosis. This resulted in a final cohort of 251 cases for the association analysis, comprising 27 positive and 224 negative genetic outcomes.</p></sec><sec id="s3-2"><title>Evaluation of LLMs&#x2019; Performance in Multidimensional Classification</title><p>V3.2-B demonstrated high accuracy (&#x003E;90%) and <italic>F</italic><sub>1</sub>-score (&#x003E;0.9) across 4 fact-based classifications (<xref ref-type="table" rid="table1">Table 1</xref>). Specifically, its accuracy was 98.4% (250/254) for standardized terminology, 92.9% (236/254) for primary classification, 90.1% (229/254) for anatomical system, and 98.4% (250/254) for abnormality count. When independently applied to the entire dataset for these 4 objective tasks, V3.2-R achieved higher performance metrics across all categories compared to V3.2-B. 
However, because the performance of the base model was already exceptionally high, the improvements provided by the reasoning model were marginal (a maximum accuracy difference of 4.7% in the primary classification).</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Performance of the DeepSeek-V3.2 in classifying fetal ultrasound abnormalities (N=254).</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Classification scheme</td><td align="left" valign="bottom">Accuracy (V3.2-B)<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>, n (%)</td><td align="left" valign="bottom">Accuracy (V3.2-R)<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup>, n (%)</td><td align="left" valign="bottom">Precision</td><td align="left" valign="bottom">Recall</td><td align="left" valign="bottom"><italic>F</italic><sub>1</sub>-score</td></tr></thead><tbody><tr><td align="left" valign="top">Standardized terminology</td><td align="left" valign="top">250 (98.4)</td><td align="left" valign="top">254 (100)</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td></tr><tr><td align="left" valign="top">Primary classification</td><td align="left" valign="top">236 (92.9)</td><td align="left" valign="top">248 (97.6)</td><td align="left" valign="top">0.94</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.95</td></tr><tr><td align="left" valign="top">Anatomical system</td><td align="left" valign="top">229 (90.1)</td><td align="left" valign="top">240 (94.4)</td><td align="left" valign="top">0.90</td><td align="left" valign="top">0.93</td><td align="left" valign="top">0.91</td></tr><tr><td align="left" valign="top">Abnormality count</td><td align="left" valign="top">250 (98.4)</td><td align="left" valign="top">254 (100)</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.96</td><td align="left" 
valign="top">0.98</td></tr><tr><td align="left" valign="top">Severity classification<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">144 (56.7)</td><td align="left" valign="top">215 (84.6)</td><td align="left" valign="top">0.83<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">0.77<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">0.75<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td></tr><tr><td align="left" valign="top">&#x2003;Lethal (n=10)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.40</td><td align="left" valign="top">0.57</td></tr><tr><td align="left" valign="top">&#x2003;Major (n=31)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.53</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.70</td></tr><tr><td align="left" valign="top">&#x2003;Minor (n=55)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.79</td><td align="left" valign="top">0.82</td><td align="left" valign="top">0.80</td></tr><tr><td align="left" valign="top">&#x2003;Other (n=158)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.85</td><td align="left" valign="top">0.92</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>V3.2-B: DeepSeek-V3.2 base model.</p></fn><fn id="table1fn2"><p><sup>b</sup>V3.2-R: DeepSeek-V3.2 reasoning-enhanced model.</p></fn><fn id="table1fn3"><p><sup>c</sup>For the 4 factual classifications (excluding severity classification), the precision, 
recall, and <italic>F</italic><sub>1</sub>-score metrics reported in the subsequent analysis are derived from V3.2-B, as its initial performance achieved the predefined high-level threshold of this study (<italic>F</italic><sub>1</sub>-score &#x003E;0.90). For the severity classification, the detailed performance metrics reported are based on V3.2-R, as the initial accuracy of V3.2-B was inadequate.</p></fn><fn id="table1fn4"><p><sup>d</sup>Not applicable.</p></fn></table-wrap-foot></table-wrap><p>In contrast, the &#x201C;Severity&#x201D; classification proved more challenging. V3.2-B&#x2019;s accuracy was only 56.7% (144/254), while V3.2-R significantly improved to 84.6% (215/254), with a macro-<italic>F</italic><sub>1</sub>-score of 0.75.</p><p>A detailed breakdown of V3.2-R&#x2019;s performance on the severity task revealed trade-offs: it achieved perfect precision for &#x201C;lethal malformation&#x201D; but with low recall (0.40), while for &#x201C;major malformation,&#x201D; it achieved perfect recall at the cost of lower precision (0.53). 
The LLM performed best when classifying findings into the &#x201C;Other&#x201D; category (<italic>F</italic><sub>1</sub>-score=0.92).</p></sec><sec id="s3-3"><title>Comparative Efficacy of RAG Versus CoT Reasoning in Severity Assessment</title><p>We evaluated the performance of the V3.2-B and V3.2-R models across both the internal retrieval set and the external test set, before and after the implementation of RAG (<xref ref-type="table" rid="table2">Table 2</xref>).</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Comparative performance of DeepSeek-V3.2 base and reasoning models for fetal anomaly severity assessment before and after RAG<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model and set</td><td align="left" valign="bottom" colspan="4">Before RAG<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> (n=127)</td><td align="left" valign="bottom" colspan="4">After RAG (n=127)</td><td align="left" valign="bottom"><italic>P</italic> value<sup><xref ref-type="table-fn" rid="table2fn3">c</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top"/><td align="left" valign="top">Accuracy</td><td align="left" valign="top">Precision</td><td align="left" valign="top">Recall</td><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">Accuracy</td><td align="left" valign="top">Precision</td><td align="left" valign="top">Recall</td><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top"/></tr><tr><td align="left" valign="top" colspan="10">V3.2-B<sup><xref ref-type="table-fn" rid="table2fn4">d</xref></sup></td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Retrieval</td><td align="left" valign="top">0.56</td><td align="left" 
valign="top">0.45</td><td align="left" valign="top">0.62</td><td align="left" valign="top">0.48</td><td align="left" valign="top">0.70</td><td align="left" valign="top">0.61</td><td align="left" valign="top">0.68</td><td align="left" valign="top">0.61</td><td align="left" valign="top">.002</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Lethal (n=4)</td><td align="left" valign="top">&#x2014;<sup><xref ref-type="table-fn" rid="table2fn5">e</xref></sup></td><td align="left" valign="top">0.57</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.72</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.50</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.60</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Major (n=19)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.29</td><td align="left" valign="top">0.84</td><td align="left" valign="top">0.43</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.40</td><td align="left" valign="top">0.79</td><td align="left" valign="top">0.54</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Minor (n=26)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.62</td><td 
align="left" valign="top">0.38</td><td align="left" valign="top">0.48</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other (n=78)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.96</td><td align="left" valign="top">0.65</td><td align="left" valign="top">0.78</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.90</td><td align="left" valign="top">0.78</td><td align="left" valign="top">0.84</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test</td><td align="left" valign="top">0.57</td><td align="left" valign="top">0.43</td><td align="left" valign="top">0.62</td><td align="left" valign="top">0.46</td><td align="left" valign="top">0.59</td><td align="left" valign="top">0.49</td><td align="left" valign="top">0.59</td><td align="left" valign="top">0.49</td><td align="left" valign="top">.61</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Lethal (n=6)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.54</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.71</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.44</td><td align="left" valign="top">0.67</td><td align="left" valign="top">0.53</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Major 
(n=12)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.18</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.29</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.25</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.35</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Minor (n=29)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">0</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.30</td><td align="left" valign="top">0.24</td><td align="left" valign="top">0.27</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other (n=80)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.98</td><td align="left" valign="top">0.71</td><td align="left" valign="top">0.83</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.69</td><td align="left" valign="top">0.81</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top">V3.2-R<sup><xref ref-type="table-fn" rid="table2fn6">f</xref></sup></td><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/><td align="left" valign="top"/></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Retrieval</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.72</td><td align="left" valign="top">0.70</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.99</td><td align="left" valign="top">&#x003C;.001</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Lethal (n=4)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.25</td><td align="left" valign="top">0.40</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.00</td><td align="left" valign="top">1.00</td><td align="left" valign="top">1.00</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Major (n=19)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.58</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.73</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.00</td><td align="left" valign="top">1.00</td><td align="left" valign="top">1.00</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Minor (n=26)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.75</td><td align="left" valign="top">0.81</td><td align="left" 
valign="top">0.78</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.96</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.98</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other (n=78)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.91</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.99</td><td align="left" valign="top">0.99</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Test</td><td align="left" valign="top">0.86</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.80</td><td align="left" valign="top">0.77</td><td align="left" valign="top">0.81</td><td align="left" valign="top">0.69</td><td align="left" valign="top">0.69</td><td align="left" valign="top">0.69</td><td align="left" valign="top">.33</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Lethal (n=6)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.50</td><td align="left" valign="top">0.67</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.60</td><td align="left" valign="top">0.50</td><td align="left" valign="top">0.50</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Major (n=12)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.48</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.65</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.53</td><td align="left" valign="top">0.67</td><td align="left" valign="top">0.59</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Minor (n=29)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.83</td><td align="left" valign="top">0.83</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.71</td><td align="left" valign="top">0.69</td><td align="left" valign="top">0.70</td><td align="left" valign="top">&#x2014;</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>Other (n=80)</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">1.00</td><td align="left" valign="top">0.87</td><td align="left" valign="top">0.93</td><td align="left" valign="top">&#x2014;</td><td align="left" valign="top">0.90</td><td align="left" valign="top">0.90</td><td align="left" valign="top">0.90</td><td align="left" valign="top">&#x2014;</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>Data represent the performance metrics for the internal retrieval set (n=127) and the external test set (n=127). 
The retrieval set consists of data used to construct the RAG vector database, whereas the test set comprises the unseen data.</p></fn><fn id="table2fn2"><p><sup>b</sup>RAG: retrieval-augmented generation.</p></fn><fn id="table2fn3"><p><sup>c</sup><italic>P</italic> values were calculated using the McNemar test to determine the statistical significance of the difference in overall accuracy before and after the implementation of RAG for each model.</p></fn><fn id="table2fn4"><p><sup>d</sup>V3.2-B: DeepSeek-V3.2 base model.</p></fn><fn id="table2fn5"><p><sup>e</sup>Not applicable.</p></fn><fn id="table2fn6"><p><sup>f</sup>V3.2-R: DeepSeek-V3.2 reasoning-enhanced model.</p></fn></table-wrap-foot></table-wrap><p>V3.2-R demonstrated a substantial performance advantage over V3.2-B prior to the application of RAG. On the test set, V3.2-R achieved an accuracy of 86%, significantly outperforming V3.2-B, which achieved only 57%. Notably, in the classification of &#x201C;Minor&#x201D; anomalies, V3.2-B completely failed to identify any cases (precision, recall, and <italic>F</italic><sub>1</sub>-score=0), whereas V3.2-R achieved a high <italic>F</italic><sub>1</sub>-score of 0.83. This disparity highlights the intrinsic limitation of V3.2-B in handling subjective severity grading without explicit reasoning capabilities.</p><p>When applied to the retrieval set, RAG significantly improved the performance of both models. The accuracy of V3.2-B increased from 56% to 70% (<italic>P</italic>=.002), with the <italic>F</italic><sub>1</sub>-score for &#x201C;Minor&#x201D; anomalies rising from 0 to 0.48, indicating that the model successfully retrieved relevant examples to correct its output. Moreover, V3.2-R achieved near-perfect performance with RAG, improving accuracy from 83% to 99% (<italic>P</italic>&#x003C;.001). Precision and recall metrics across all severity subtypes approached or reached 1.00. 
These results confirm that the RAG pipeline was technically functional and capable of enhancing performance when the test data were semantically identical or highly similar to the knowledge base.</p><p>Crucially, the performance gains observed in the retrieval set did not translate to the external test set, revealing a critical limitation in the generalizability of RAG for this specific task. V3.2-B showed no statistically significant improvement with RAG (accuracy: 57% vs 59%, <italic>P</italic>=.61). While RAG slightly improved the detection of &#x201C;Minor&#x201D; anomalies (<italic>F</italic><sub>1</sub>-score increased from 0 to 0.27), the overall capability remained suboptimal compared to the reasoning model. The implementation of RAG on V3.2-R resulted in a slight, though not statistically significant, decline in accuracy on the external test set (86% vs 81%, <italic>P</italic>=.33). Specifically, the <italic>F</italic><sub>1</sub>-scores for &#x201C;Lethal&#x201D; and &#x201C;Major&#x201D; anomalies decreased after RAG (lethal: from 0.67 to 0.50; major: from 0.65 to 0.59).</p><p>These data indicate that while RAG can effectively guide LLMs to memorize or retrieve specific patterns within the knowledge base, it fails to significantly enhance&#x2014;and may potentially hinder&#x2014;performance on unseen, heterogeneous clinical data. 
In contrast, the CoT reasoning inherent in V3.2-R (without RAG) proved to be the most robust approach for the subjective task of severity assessment, achieving the highest stand-alone accuracy (86%) and <italic>F</italic><sub>1</sub>-scores on the external validation cohort.</p></sec><sec id="s3-4"><title>Descriptive Statistics of Prenatal Ultrasound Abnormalities</title><p>A descriptive analysis of the curated dataset revealed the distribution of abnormalities across the 5 classification schemes (<xref ref-type="fig" rid="figure2">Figure 2</xref>).</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Distribution of prenatal ultrasound abnormalities based on the manually curated classifications generated by DeepSeek-V3.2. (A) Frequencies of standardized medical terms for ultrasound findings. The model standardized unstructured descriptions into common terms, generating 371 entries in total. Only terms with a frequency greater than 1% are displayed. (B) Distribution of abnormalities by the affected anatomical system (n=176). (C) Distribution of cases by severity classification (n=254).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e91399_fig02.png"/></fig><p>By standardized terminology, the most frequent among the 371 identified findings were increased NT (88/371, 23.7%), FGR (24/371, 6.5%), choroid plexus cyst (21/371, 5.7%), and single umbilical artery (13/371, 3.5%; <xref ref-type="fig" rid="figure2">Figure 2</xref>A). Most reports described solitary findings (171/254, 67.3%) rather than multiple findings (83/254, 32.7%). Among 176 classified structural anomalies, the cardiovascular (58/176, 33%) and nervous (43/176, 24.4%) systems were the most commonly affected (<xref ref-type="fig" rid="figure2">Figure 2</xref>B). 
The primary classifications were distributed among increased NT (96/292, 32.5%), other soft markers (83/292, 28.4%), structural abnormalities (82/292, 28.1%), and FGR (32/292, 11%). Finally, by severity, most cases were categorized as &#x201C;Other&#x201D; (158/254, 62.2%), followed by minor (55/254, 21.7%), major (31/254, 12.2%), and lethal (10/254, 3.9%) malformations (<xref ref-type="fig" rid="figure2">Figure 2</xref>C).</p></sec><sec id="s3-5"><title>Association Analysis</title><p>An association analysis correlated the classified ultrasound findings with genetic outcomes (<xref ref-type="fig" rid="figure3">Figure 3</xref>). The presence of multiple abnormalities was associated with a significantly higher rate of positive genetic diagnoses than solitary findings (14/82, 17.1% vs 13/169, 7.7%; <italic>P</italic>=.02; <xref ref-type="fig" rid="figure3">Figure 3</xref>D).</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Association analysis between classified prenatal ultrasound abnormalities and genetic diagnostic outcomes. The bar charts display the number and proportion of cases with negative (blue) and positive (red) prenatal diagnoses for each classification scheme. (A) Analysis by standardized terminology. This panel shows all 27 positive cases and their associated ultrasound findings. For clarity, only findings from the negative diagnosis group with a frequency greater than 2 are displayed. (B) Analysis by primary classification. (C) Analysis by the anatomical system. (D) Analysis by the abnormality count. The rate of positive diagnoses was significantly higher in cases with multiple abnormalities compared to those with solitary findings (<italic>P</italic>=.02, Pearson <italic>&#x03C7;</italic><sup>2</sup> test). (E) Analysis by severity (subjective assessment). 
AVSD: atrioventricular septal defect; DA: ductus arteriosus; DV: ductus venosus; FGR: fetal growth restriction; NT: nuchal translucency; RR: relative risk; VSD: ventricular septal defect.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e91399_fig03.png"/></fig><p>Among solitary findings, the increased NT was the most common abnormality associated with a positive genetic diagnosis (n=9; <xref ref-type="fig" rid="figure3">Figures 3</xref>A and 3B). Notably, no positive diagnoses were found in cases with isolated choroid plexus cysts, ventriculomegaly, echogenic bowel, or ventricular septal defects (<xref ref-type="fig" rid="figure3">Figure 3</xref>A).</p><p>Risk is also correlated with the anatomical system. Among cases with positive genetic diagnoses, cardiovascular and lymphatic system abnormalities were the most frequent. Conversely, no positive diagnoses in this cohort were associated with isolated anomalies of the urinary, digestive, or musculoskeletal systems (<xref ref-type="fig" rid="figure3">Figure 3</xref>C).</p><p>More importantly, lethal and major malformations were disproportionately represented in the positive diagnosis cohort (<xref ref-type="fig" rid="figure3">Figure 3</xref>E), accounting for 14.8% (4/27) and 18.5% (5/27) of positive cases, respectively. In addition, positive cases constituted 44.44% (4/9) of all lethal malformations and 16.13% (5/31) of all major malformations but only 7.14% (4/56) of minor malformations.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Results</title><p>This study establishes an adaptive &#x201C;fast-slow&#x201D; framework utilizing the open-source DeepSeek-V3.2 family for the automated, multidimensional classification of prenatal ultrasound reports. 
By strategically deploying a high-speed base model for factual extraction and a reasoning-enhanced model for subjective assessment, our approach significantly enhances data annotation efficiency while resolving the complexity of phenotype validation. Crucially, we identified a pivotal mechanistic dichotomy: while RAG improves performance on the data seen within the knowledge base, it fails to generalize to external subjective tasks. In contrast, CoT reasoning demonstrates superior robustness in &#x201C;unseen&#x201D; scenarios, effectively mimicking the &#x201C;System 2&#x201D; clinical judgment required for severity grading. This work provides a foundational pipeline for phenotype-driven research using unstructured hospital data and offers a reliable tool to support clinical decision-making, highlighting the necessity of matching model cognitive architectures to clinical task complexity.</p></sec><sec id="s4-2"><title>Limitations</title><p>This study has limitations. First, while we highlighted the superiority of CoT over RAG for subjective tasks, our RAG implementation utilized a specific reranking strategy (Qwen3-Reranker). Alternative retrieval algorithms or hybrid approaches might yield different results. Second, the cohort, while expertly annotated, is relatively small (n=254) and derived from a single center, and the fact that the &#x201C;gold standard&#x201D; annotations were established through the expert review of model outputs rather than being fully independent inevitably introduces a degree of subjectivity. Consequently, certain clinically important subcategories, such as lethal malformations, are represented by very small sample sizes, which may affect the statistical stability of our performance metrics within these subgroups. Furthermore, our evaluation utilized an internal sequential split. 
While the strict separation of patient IDs prevented direct data leakage, the unseen test set shared local linguistic patterns with the retrieval knowledge base. This setup inherently provides an optimistic performance estimate for the RAG pipeline. However, this constraint actually reinforces our primary findings; even under conditions highly favorable to retrieval, CoT reasoning still demonstrated superior robustness for subjective tasks. While this single-center design accurately simulates local clinical deployment&#x2014;where a hospital utilizes its own historical records&#x2014;it does not evaluate true out-of-distribution generalizability. Broader generalizability across different institutional reporting styles remains to be established through future multicenter or larger-scale studies. Third, we did not formally evaluate prompt engineering variations [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>], although reasoning models typically show resilience to prompt nuances [<xref ref-type="bibr" rid="ref24">24</xref>]. Fourth, potential confounders, such as gestational age, were not integrated into the model&#x2019;s decision logic, warranting future multimodal investigations. Finally, it is also important to acknowledge limitations regarding our clinical end points. For one, treating all low-risk NIPT2.0 cases as negative genetic outcomes without universal confirmatory amniocentesis&#x2014;while clinically justified by the test&#x2019;s high negative predictive value and our cohort observations&#x2014;may introduce a minor risk of misclassification bias. Statistically, this inherent uncertainty of screening-based negatives could potentially lead to a slight underestimation of the true genetic risk associated with specific ultrasound phenotypes. Furthermore, our validation specifically targeted pathogenic genetic risk to support invasive diagnostic decision-making. 
Because ultrasound anomalies frequently arise from heterogeneous nongenetic etiologies, a more comprehensive assessment of the framework&#x2019;s overall clinical utility will require future studies integrating additional longitudinal end points, such as postnatal diagnoses and long-term functional outcomes.</p></sec><sec id="s4-3"><title>Comparison With Prior Work</title><p>While LLMs have shown broad capabilities across medicine [<xref ref-type="bibr" rid="ref25">25</xref>-<xref ref-type="bibr" rid="ref47">47</xref>], a &#x201C;one-size-fits-all&#x201D; approach remains inefficient for complex clinical workflows. Our findings challenge the prevailing assumption that RAG is the universal solution for medical LLM hallucinations. In our study, RAG successfully corrected the base model&#x2019;s errors within the retrieval set, confirming its utility for pattern matching. However, this performance collapsed on the external test set. This suggests that for subjective tasks like severity assessment&#x2014;which rely on synthesizing subtle cues&#x2014;semantic retrieval is insufficient. RAG retrieves similar text chunks but not necessarily the logic of the diagnosis. Conversely, the V3.2-R model, utilizing CoT, achieved an 86% accuracy on the external set without accessing the knowledge base. This indicates that internalized reasoning capabilities (navigating clinical logic steps) are more critical than external knowledge retrieval (accessing facts) when dealing with the nuanced subjectivity of fetal anomalies [<xref ref-type="bibr" rid="ref44">44</xref>]. 
Notably, introducing RAG to V3.2-R degraded performance to 81%, suggesting potential noise interference.</p><p>Unlike commercial proprietary models, the open-source nature of the DeepSeek suite allows for secure local deployment, ensuring patient data privacy&#x2014;a nonnegotiable requirement for handling sensitive prenatal records [<xref ref-type="bibr" rid="ref39">39</xref>,<xref ref-type="bibr" rid="ref40">40</xref>,<xref ref-type="bibr" rid="ref48">48</xref>-<xref ref-type="bibr" rid="ref51">51</xref>]. Our framework maximizes resource efficiency: by routing the majority of straightforward tasks (entity extraction and counting) to the &#x201C;Fast&#x201D; V3.2-B model, we reserve the computationally expensive &#x201C;Slow&#x201D; V3.2-R model only for tasks where it provides a statistically significant benefit. This tiered approach addresses the processing speed bottlenecks often cited as a barrier to deploying reasoning models in real-time clinical settings [<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>]. Admittedly, the DeepSeek LLMs in this study did not achieve perfect accuracy across all classification tasks; even V3.2-R achieved only 84.6% accuracy in the subjective severity grading of the entire dataset. However, model performance is expected to improve with future advancements in open-source LLMs. Furthermore, while current LLMs cannot fully replace clinical practice, they are already sufficient to significantly enhance efficiency.</p><p>The ultimate value of this technical framework lies in its clinical utility. By enabling high-throughput, multidimensional classification, we were able to conduct an association analysis between sonographic phenotypes and genetic outcomes. However, these association results are exploratory and specific to our single-center cohort. 
Establishing higher-level evidence for the strength of phenotype-genotype correlations would necessitate much larger, multicenter datasets and more rigorous statistical validation to ensure broader clinical applicability. The fundamental aim of this analysis was to illustrate the practical clinical significance of the multidimensional profiles generated by our LLM framework. Our automated severity grading successfully stratified patients into distinct risk categories, confirming established high-risk predictors, such as multiple anomalies and specific system involvement (eg, cardiovascular) [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref54">54</xref>-<xref ref-type="bibr" rid="ref58">58</xref>]. The decreasing genetic risk observed across lethal (4/9, 44.44%), major (5/31, 16.13%), and minor (4/56, 7.14%) malformations demonstrates that subjective severity grading is an indispensable dimension for phenotype-driven diagnosis. Importantly, the reasoning model accurately identified &#x201C;Minor&#x201D; malformations&#x2014;a category the base model completely missed. This granularity provides quantitative support for the lower (yet nonnegligible) risk nature of isolated markers, potentially aiding in reducing unnecessary invasive procedures for low-risk findings [<xref ref-type="bibr" rid="ref59">59</xref>], while ensuring that subtle but significant patterns are not overlooked [<xref ref-type="bibr" rid="ref60">60</xref>]. This validates that our &#x201C;human-in-the-loop&#x201D; LLM framework does not merely digitize text but actively contributes to refining risk stratification.</p></sec><sec id="s4-4"><title>Conclusions</title><p>This study demonstrates that a monolithic LLM strategy is insufficient for the diverse challenges of prenatal diagnosis. 
We propose an adaptive framework where &#x201C;Fast&#x201D; models handle factual extraction, while &#x201C;Slow&#x201D; reasoning models are prioritized for subjective clinical assessment, as they demonstrated greater robustness than our specific RAG implementation in this cohort. However, this finding is contextualized within our current experimental framework and does not constitute a generalized conclusion regarding the inherent superiority of CoT over RAG. By aligning the cognitive architecture of LLM agents with the cognitive demands of medical tasks, we offer a scalable, privacy-preserving path to transform unstructured ultrasound narratives into actionable, phenotype-driven clinical intelligence.</p></sec></sec></body><back><ack><p>No generative artificial intelligence tools were used at any stage in the preparation of this manuscript.</p></ack><notes><sec><title>Funding</title><p>This study was supported by the National Key Research and Development Program of China (2023YFC2705600), the Capital Clinical Characteristic Diagnosis and Treatment Technology Research and Transformation Application Project (Z221100007422012), the Beijing Hospital Management Center "Yangfan" Plan 3.0 Clinical Technology Innovation Project (ZLRK202329), and the Science and Technology Innovation and Transformation Special Project of Beijing Obstetrics and Gynecology Hospital Affiliated to Capital Medical University/Beijing Maternal and Child Health Hospital (FCYYZH202201).</p></sec><sec><title>Data Availability</title><p>All data generated or analyzed during this study, including the full anonymized dataset (comprising original Chinese ultrasound texts, English translations, and specific amniocentesis outcomes), expert annotations, and full large language model (LLM) prompts, are included in this published article (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
The code used for data analysis, including the generation of <xref ref-type="fig" rid="figure2">Figures 2 and 3</xref>, evaluation of LLM performance metrics, and statistical analyses, is openly available in the public repository [<xref ref-type="bibr" rid="ref61">61</xref>]. Furthermore, to ensure full methodological reproducibility without requiring custom execution scripts, the exact workflow configurations and hyperparameters used for the LLM inference and retrieval-augmented generation pipeline via the open-source Dify platform are thoroughly documented in the Methods section.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: WZ, HY</p><p>Methodology: WZ, HY, Yifan Liu, KY</p><p>Investigation: WZ, HY, Yifan Liu, KY</p><p>Formal analysis: Yan Liu, HG, ZY</p><p>Visualization: WZ</p><p>Writing &#x2013; original draft: WZ, HY</p><p>Writing &#x2013; review &#x0026; editing: WZ, HY, Yifan Liu, Yan Liu, KY, HG, ZY, WH, YY, CY</p><p>Project administration: YY, CY</p><p>Supervision: YY, CY</p><p>Funding acquisition: YY, CY</p><p>All authors reviewed and approved the final manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">CoT</term><def><p>chain-of-thought</p></def></def-item><def-item><term id="abb2">FGR</term><def><p>fetal growth restriction</p></def></def-item><def-item><term id="abb3">LLMs</term><def><p>large language models</p></def></def-item><def-item><term id="abb4">NIPT2.0</term><def><p>enhanced noninvasive prenatal test</p></def></def-item><def-item><term id="abb5">NT</term><def><p>nuchal translucency</p></def></def-item><def-item><term id="abb6">RAG</term><def><p>retrieval-augmented generation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Karim</surname><given-names>JN</given-names> </name><name name-style="western"><surname>Campbell</surname><given-names>H</given-names> </name><name name-style="western"><surname>Pandya</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Clinical and cost-effectiveness of detailed anomaly ultrasound screening in the first trimester: a mixed-methods study</article-title><source>Health Technol Assess</source><year>2025</year><month>05</month><volume>29</volume><issue>22</issue><fpage>1</fpage><lpage>250</lpage><pub-id pub-id-type="doi">10.3310/NLTP7102</pub-id><pub-id pub-id-type="medline">40455571</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rivero-Arias</surname><given-names>O</given-names> </name><name name-style="western"><surname>Png</surname><given-names>ME</given-names> </name><name name-style="western"><surname>White</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Benefits and harms of antenatal and newborn screening programmes in health economic assessments: the VALENTIA systematic review and qualitative investigation</article-title><source>Health Technol Assess</source><year>2024</year><month>06</month><volume>28</volume><issue>25</issue><fpage>1</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.3310/PYTK6591</pub-id><pub-id pub-id-type="medline">38938110</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ryan</surname><given-names>GA</given-names> </name><name name-style="western"><surname>Start</surname><given-names>AO</given-names> </name><name name-style="western"><surname>Cathcart</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Prenatal findings and associated survival rates 
in fetal ventriculomegaly: a prospective observational study</article-title><source>Int J Gynaecol Obstet</source><year>2022</year><month>12</month><volume>159</volume><issue>3</issue><fpage>891</fpage><lpage>897</lpage><pub-id pub-id-type="doi">10.1002/ijgo.14206</pub-id><pub-id pub-id-type="medline">35373343</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bergstr&#x00F6;m</surname><given-names>C</given-names> </name><name name-style="western"><surname>Ngarina</surname><given-names>M</given-names> </name><name name-style="western"><surname>Abeid</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Health professionals&#x2019; experiences and views on obstetric ultrasound in Tanzania: a cross-sectional study</article-title><source>Womens Health (Lond)</source><year>2024</year><volume>20</volume><fpage>17455057241273675</fpage><pub-id pub-id-type="doi">10.1177/17455057241273675</pub-id><pub-id pub-id-type="medline">39206633</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rossi</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Prefumo</surname><given-names>F</given-names> </name></person-group><article-title>Accuracy of ultrasonography at 11-14 weeks of gestation for detection of fetal structural anomalies: a systematic review</article-title><source>Obstet Gynecol</source><year>2013</year><month>12</month><volume>122</volume><issue>6</issue><fpage>1160</fpage><lpage>1167</lpage><pub-id pub-id-type="doi">10.1097/AOG.0000000000000015</pub-id><pub-id pub-id-type="medline">24201688</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Huang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>D</given-names> </name><name name-style="western"><surname>He</surname><given-names>JH</given-names> </name><etal/></person-group><article-title>Associations between genomic aberrations, increased nuchal translucency, and pregnancy outcomes: a comprehensive analysis of 2,272 singleton pregnancies in women under 35</article-title><source>Front Med (Lausanne)</source><year>2024</year><volume>11</volume><fpage>1376319</fpage><pub-id pub-id-type="doi">10.3389/fmed.2024.1376319</pub-id><pub-id pub-id-type="medline">38633307</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Su</surname><given-names>J</given-names> </name><name name-style="western"><surname>Qin</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Fu</surname><given-names>H</given-names> </name><etal/></person-group><article-title>Association of prenatal renal ultrasound abnormalities with pathogenic copy number variants in a large Chinese cohort</article-title><source>Ultrasound Obstet Gynecol</source><year>2022</year><month>02</month><volume>59</volume><issue>2</issue><fpage>226</fpage><lpage>233</lpage><pub-id pub-id-type="doi">10.1002/uog.23702</pub-id><pub-id pub-id-type="medline">34090309</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gibney</surname><given-names>E</given-names> </name></person-group><article-title>&#x2018;Another DeepSeek moment&#x2019;: Chinese AI model Kimi K2 stirs excitement</article-title><source>Nature</source><year>2025</year><month>07</month><day>24</day><volume>643</volume><issue>8073</issue><fpage>889</fpage><lpage>890</lpage><pub-id 
pub-id-type="doi">10.1038/d41586-025-02275-6</pub-id><pub-id pub-id-type="medline">40670748</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Normile</surname><given-names>D</given-names> </name></person-group><article-title>Chinese firm&#x2019;s large language model makes a splash</article-title><source>Science</source><year>2025</year><month>01</month><day>17</day><volume>387</volume><issue>6731</issue><fpage>238</fpage><pub-id pub-id-type="doi">10.1126/science.adv9836</pub-id><pub-id pub-id-type="medline">39818899</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gibney</surname><given-names>E</given-names> </name></person-group><article-title>China&#x2019;s cheap, open AI model DeepSeek thrills scientists</article-title><source>Nature</source><year>2025</year><month>02</month><volume>638</volume><issue>8049</issue><fpage>13</fpage><lpage>14</lpage><pub-id pub-id-type="doi">10.1038/d41586-025-00229-6</pub-id><pub-id pub-id-type="medline">39849139</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>D</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><etal/></person-group><article-title>DeepSeek-R1 incentivizes reasoning in LLMs through reinforcement learning</article-title><source>Nature</source><year>2025</year><month>09</month><volume>645</volume><issue>8081</issue><fpage>633</fpage><lpage>638</lpage><pub-id pub-id-type="doi">10.1038/s41586-025-09422-z</pub-id><pub-id 
pub-id-type="medline">40962978</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>H</given-names> </name><name name-style="western"><surname>Gu</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Application of large language models in medicine</article-title><source>Nat Rev Bioeng</source><year>2025</year><month>06</month><volume>3</volume><issue>6</issue><fpage>445</fpage><lpage>464</lpage><pub-id pub-id-type="doi">10.1038/s44222-025-00279-5</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Amugongo</surname><given-names>LM</given-names> </name><name name-style="western"><surname>Mascheroni</surname><given-names>P</given-names> </name><name name-style="western"><surname>Brooks</surname><given-names>S</given-names> </name><name name-style="western"><surname>Doering</surname><given-names>S</given-names> </name><name name-style="western"><surname>Seidel</surname><given-names>J</given-names> </name></person-group><article-title>Retrieval augmented generation for large language models in healthcare: a systematic review</article-title><source>PLOS Digit Health</source><year>2025</year><month>06</month><volume>4</volume><issue>6</issue><fpage>e0000877</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000877</pub-id><pub-id pub-id-type="medline">40498738</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ahn</surname><given-names>S</given-names> </name></person-group><article-title>A guide to evade hallucinations and maintain reliability when using large 
language models for medical research: a narrative review</article-title><source>Ann Pediatr Endocrinol Metab</source><year>2025</year><month>06</month><volume>30</volume><issue>3</issue><fpage>115</fpage><lpage>118</lpage><pub-id pub-id-type="doi">10.6065/apem.2448278.139</pub-id><pub-id pub-id-type="medline">40624912</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shafir</surname><given-names>E</given-names> </name></person-group><article-title>Daniel Kahneman obituary: psychologist who revolutionized the way we think about thinking</article-title><source>Nature</source><year>2024</year><month>05</month><day>16</day><volume>629</volume><issue>8012</issue><fpage>526</fpage><pub-id pub-id-type="doi">10.1038/d41586-024-01344-6</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fischhoff</surname><given-names>B</given-names> </name></person-group><article-title>Daniel Kahneman (1934&#x2013;2024)</article-title><source>Science</source><year>2024</year><month>05</month><day>3</day><volume>384</volume><issue>6695</issue><fpage>515</fpage><lpage>515</lpage><pub-id pub-id-type="doi">10.1126/science.adp6405</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Prospective prenatal cell-free DNA screening for genetic conditions of heterogenous etiologies</article-title><source>Nat 
Med</source><year>2024</year><month>02</month><volume>30</volume><issue>2</issue><fpage>470</fpage><lpage>479</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02774-x</pub-id><pub-id pub-id-type="medline">38253798</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><article-title>LangGenius / dify</article-title><source>GitHub</source><year>2025</year><access-date>2025-12-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/langgenius/dify">https://github.com/langgenius/dify</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="web"><source>SiliconFlow</source><access-date>2025-08-09</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.siliconflow.com/">https://www.siliconflow.com/</ext-link></comment></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mei</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lin</surname><given-names>B</given-names> </name><etal/></person-group><article-title>DeepSeek-V3.2: pushing the frontier of open large language models</article-title><source>arXiv</source><comment>Preprint posted online on Dec 2, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2512.02556</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>H</given-names> </name><name name-style="western"><surname>Yin</surname><given-names>H</given-names> </name><name name-style="western"><surname>Luo</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>X</given-names> 
</name></person-group><article-title>Integrating chemistry knowledge in large language models via prompt engineering</article-title><source>Synth Syst Biotechnol</source><year>2025</year><volume>10</volume><issue>1</issue><fpage>23</fpage><lpage>38</lpage><pub-id pub-id-type="doi">10.1016/j.synbio.2024.07.004</pub-id><pub-id pub-id-type="medline">39206087</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Schulhoff</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ilie</surname><given-names>M</given-names> </name><name name-style="western"><surname>Balepur</surname><given-names>N</given-names> </name><etal/></person-group><article-title>The prompt report: a systematic survey of prompt engineering techniques</article-title><source>arXiv</source><comment>Preprint posted online on Jun 6, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.06608</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Du</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Improving large language models for clinical named entity recognition via prompt engineering</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1812</fpage><lpage>1820</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad259</pub-id><pub-id pub-id-type="medline">38281112</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Jeon</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>HG</given-names> </name></person-group><article-title>A comparative evaluation of chain-of-thought-based prompt engineering techniques for medical question answering</article-title><source>Comput Biol Med</source><year>2025</year><month>09</month><volume>196</volume><issue>Pt A</issue><fpage>110614</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2025.110614</pub-id><pub-id pub-id-type="medline">40602316</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Das</surname><given-names>A</given-names> </name><name name-style="western"><surname>Talati</surname><given-names>IA</given-names> </name><name name-style="western"><surname>Chaves</surname><given-names>JMZ</given-names> </name><name name-style="western"><surname>Rubin</surname><given-names>D</given-names> </name><name name-style="western"><surname>Banerjee</surname><given-names>I</given-names> </name></person-group><article-title>Weakly supervised language models for automated extraction of critical findings from radiology reports</article-title><source>NPJ Digit Med</source><year>2025</year><month>05</month><day>8</day><volume>8</volume><issue>1</issue><fpage>257</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01522-4</pub-id><pub-id pub-id-type="medline">40341617</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shyr</surname><given-names>C</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bastarache</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Identifying and extracting rare diseases and 
their phenotypes with large language models</article-title><source>J Healthc Inform Res</source><year>2024</year><month>06</month><volume>8</volume><issue>2</issue><fpage>438</fpage><lpage>461</lpage><pub-id pub-id-type="doi">10.1007/s41666-023-00155-0</pub-id><pub-id pub-id-type="medline">38681753</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Keshavarz</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bagherieh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Nabipoorashrafi</surname><given-names>SA</given-names> </name><etal/></person-group><article-title>ChatGPT in radiology: a systematic review of performance, pitfalls, and future perspectives</article-title><source>Diagn Interv Imaging</source><year>2024</year><volume>105</volume><issue>7-8</issue><fpage>251</fpage><lpage>265</lpage><pub-id pub-id-type="doi">10.1016/j.diii.2024.04.003</pub-id><pub-id pub-id-type="medline">38679540</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hasani</surname><given-names>AM</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zahergivar</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Evaluating the performance of Generative Pre-Trained Transformer-4 (GPT-4) in standardizing radiology reports</article-title><source>Eur Radiol</source><year>2024</year><month>06</month><volume>34</volume><issue>6</issue><fpage>3566</fpage><lpage>3574</lpage><pub-id pub-id-type="doi">10.1007/s00330-023-10384-x</pub-id><pub-id pub-id-type="medline">37938381</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Miao</surname><given-names>BY</given-names> </name><name name-style="western"><surname>Williams</surname><given-names>CYK</given-names> </name><name name-style="western"><surname>Chinedu-Eneh</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Understanding contraceptive switching rationales from real world clinical notes using large language models</article-title><source>NPJ Digit Med</source><year>2025</year><month>04</month><day>23</day><volume>8</volume><issue>1</issue><fpage>221</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01615-0</pub-id><pub-id pub-id-type="medline">40269253</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>WI</given-names> </name><name name-style="western"><surname>Leung</surname><given-names>CLK</given-names> </name><name name-style="western"><surname>Tang</surname><given-names>A</given-names> </name><name name-style="western"><surname>McNeil</surname><given-names>EB</given-names> </name><name name-style="western"><surname>Wong</surname><given-names>SYS</given-names> </name><name name-style="western"><surname>Kwok</surname><given-names>KO</given-names> </name></person-group><article-title>Extracting symptoms from free-text responses using ChatGPT among COVID-19 cases in Hong Kong</article-title><source>Clin Microbiol Infect</source><year>2024</year><month>01</month><volume>30</volume><issue>1</issue><fpage>142</fpage><pub-id pub-id-type="doi">10.1016/j.cmi.2023.11.002</pub-id><pub-id pub-id-type="medline">37949111</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Leonte</surname><given-names>KG</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>ML</given-names> </name><etal/></person-group><article-title>Large language models outperform mental and medical health care professionals in identifying obsessive-compulsive disorder</article-title><source>NPJ Digit Med</source><year>2024</year><month>07</month><day>19</day><volume>7</volume><issue>1</issue><fpage>193</fpage><pub-id pub-id-type="doi">10.1038/s41746-024-01181-x</pub-id><pub-id pub-id-type="medline">39030292</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Salem</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Gale</surname><given-names>RC</given-names> </name><name name-style="western"><surname>Fleegle</surname><given-names>M</given-names> </name><name name-style="western"><surname>Fergadiotis</surname><given-names>G</given-names> </name><name name-style="western"><surname>Bedrick</surname><given-names>S</given-names> </name></person-group><article-title>Automating intended target identification for paraphasias in discourse using a large language model</article-title><source>J Speech Lang Hear Res</source><year>2023</year><month>12</month><day>11</day><volume>66</volume><issue>12</issue><fpage>4949</fpage><lpage>4966</lpage><pub-id pub-id-type="doi">10.1044/2023_JSLHR-23-00121</pub-id><pub-id pub-id-type="medline">37931137</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bellini</surname><given-names>D</given-names> </name><name name-style="western"><surname>Ferrari</surname><given-names>R</given-names> </name><name name-style="western"><surname>Vicini</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Rengo</surname><given-names>M</given-names> </name><name name-style="western"><surname>Saletti</surname><given-names>CL</given-names> </name><name name-style="western"><surname>Carbone</surname><given-names>I</given-names> </name></person-group><article-title>Hi ChatGPT, I am a radiologist, how can you help me?</article-title><source>Radiol Med</source><year>2025</year><month>08</month><volume>130</volume><issue>8</issue><fpage>1221</fpage><lpage>1230</lpage><pub-id pub-id-type="doi">10.1007/s11547-025-02053-4</pub-id><pub-id pub-id-type="medline">40699279</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Le Guellec</surname><given-names>B</given-names> </name><name name-style="western"><surname>Lef&#x00E8;vre</surname><given-names>A</given-names> </name><name name-style="western"><surname>Geay</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Performance of an open-source large language model in extracting information from free-text radiology reports</article-title><source>Radiol Artif Intell</source><year>2024</year><month>07</month><volume>6</volume><issue>4</issue><fpage>e230364</fpage><pub-id pub-id-type="doi">10.1148/ryai.230364</pub-id><pub-id pub-id-type="medline">38717292</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>SB</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Meng</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Surge in large language models exacerbates global regional healthcare inequalities</article-title><source>J Transl 
Med</source><year>2025</year><month>07</month><day>1</day><volume>23</volume><issue>1</issue><fpage>706</fpage><pub-id pub-id-type="doi">10.1186/s12967-025-06751-5</pub-id><pub-id pub-id-type="medline">40597368</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rao</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Assessing the utility of ChatGPT throughout the entire clinical workflow: development and usability study</article-title><source>J Med Internet Res</source><year>2023</year><month>08</month><day>22</day><volume>25</volume><fpage>e48659</fpage><pub-id pub-id-type="doi">10.2196/48659</pub-id><pub-id 
pub-id-type="medline">37606976</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Habib</surname><given-names>S</given-names> </name><name name-style="western"><surname>Butt</surname><given-names>H</given-names> </name><name name-style="western"><surname>Goldenholz</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>CY</given-names> </name><name name-style="western"><surname>Goldenholz</surname><given-names>DM</given-names> </name></person-group><article-title>Large language model performance on practice epilepsy board examinations</article-title><source>JAMA Neurol</source><year>2024</year><month>06</month><day>1</day><volume>81</volume><issue>6</issue><fpage>660</fpage><lpage>661</lpage><pub-id pub-id-type="doi">10.1001/jamaneurol.2024.0676</pub-id><pub-id pub-id-type="medline">38587850</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhong</surname><given-names>W</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Performance of ChatGPT-4o and four open-source large language models in generating diagnoses based on China&#x2019;s rare disease catalog: comparative study</article-title><source>J Med Internet Res</source><year>2025</year><month>06</month><day>18</day><volume>27</volume><fpage>e69929</fpage><pub-id pub-id-type="doi">10.2196/69929</pub-id><pub-id pub-id-type="medline">40532199</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Zhong</surname><given-names>W</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>M</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Enhancing the accuracy of human phenotype ontology identification: comparative evaluation of multimodal large language models</article-title><source>J Med Internet Res</source><year>2025</year><month>06</month><day>2</day><volume>27</volume><fpage>e73233</fpage><pub-id pub-id-type="doi">10.2196/73233</pub-id><pub-id pub-id-type="medline">40456109</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shankar</surname><given-names>SV</given-names> </name><name name-style="western"><surname>Dhingra</surname><given-names>LS</given-names> </name><name name-style="western"><surname>Aminorroaya</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Automated transformation of unstructured cardiovascular diagnostic reports into structured datasets using sequentially deployed large language models</article-title><source>Eur Heart J Digit Health</source><year>2025</year><month>07</month><volume>6</volume><issue>4</issue><fpage>783</fpage><lpage>796</lpage><pub-id pub-id-type="doi">10.1093/ehjdh/ztaf030</pub-id><pub-id pub-id-type="medline">40703108</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Somani</surname><given-names>S</given-names> </name><name name-style="western"><surname>Kim</surname><given-names>DD</given-names> </name><name name-style="western"><surname>Perez-Guerrero</surname><given-names>E</given-names> </name><etal/></person-group><article-title>Understanding reasons for oral anticoagulation 
nonprescription in atrial fibrillation using large language models</article-title><source>J Am Heart Assoc</source><year>2025</year><month>04</month><volume>14</volume><issue>7</issue><fpage>e040419</fpage><pub-id pub-id-type="doi">10.1161/JAHA.124.040419</pub-id><pub-id pub-id-type="medline">40145287</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Holgate</surname><given-names>B</given-names> </name><name name-style="western"><surname>Shek</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Extracting epilepsy-related information from unstructured clinic letters using large language models</article-title><source>Epilepsia</source><year>2025</year><month>09</month><volume>66</volume><issue>9</issue><fpage>3369</fpage><lpage>3384</lpage><pub-id pub-id-type="doi">10.1111/epi.18475</pub-id><pub-id pub-id-type="medline">40637590</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Owens</surname><given-names>D</given-names> </name><name name-style="western"><surname>Nguyen</surname><given-names>DQ</given-names> </name><name name-style="western"><surname>Dohopolski</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rousseau</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Peterson</surname><given-names>ED</given-names> </name><name name-style="western"><surname>Navar</surname><given-names>AM</given-names> </name></person-group><article-title>Accuracy of large language models to identify stroke subtypes within unstructured electronic health record 
data</article-title><source>Stroke</source><year>2025</year><month>10</month><volume>56</volume><issue>10</issue><fpage>2966</fpage><lpage>2975</lpage><pub-id pub-id-type="doi">10.1161/STROKEAHA.125.051993</pub-id><pub-id pub-id-type="medline">40709446</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>He</surname><given-names>X</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Automatic quantitative stroke severity assessment based on Chinese clinical named entity recognition with domain-adaptive pre-trained large language model</article-title><source>Artif Intell Med</source><year>2024</year><month>04</month><volume>150</volume><fpage>102822</fpage><pub-id pub-id-type="doi">10.1016/j.artmed.2024.102822</pub-id><pub-id pub-id-type="medline">38553162</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Spitzl</surname><given-names>D</given-names> </name><name name-style="western"><surname>Mergen</surname><given-names>M</given-names> </name><name name-style="western"><surname>Braren</surname><given-names>R</given-names> </name><name name-style="western"><surname>Endr&#x00F6;s</surname><given-names>L</given-names> </name><name name-style="western"><surname>Eiber</surname><given-names>M</given-names> </name><name name-style="western"><surname>Steinhelfer</surname><given-names>L</given-names> </name></person-group><article-title>LLM-powered breast cancer staging from PET/CT reports: a comparative performance study</article-title><source>Int J Med Inform</source><year>2025</year><month>12</month><volume>204</volume><fpage>106053</fpage><pub-id 
pub-id-type="doi">10.1016/j.ijmedinf.2025.106053</pub-id><pub-id pub-id-type="medline">40706196</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Danhauser</surname><given-names>K</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Klein</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Using large language models to extract information from pediatric clinical reports</article-title><source>PLOS Digit Health</source><year>2025</year><month>07</month><volume>4</volume><issue>7</issue><fpage>e0000919</fpage><pub-id pub-id-type="doi">10.1371/journal.pdig.0000919</pub-id><pub-id pub-id-type="medline">40700460</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Mondillo</surname><given-names>G</given-names> </name><name name-style="western"><surname>Colosimo</surname><given-names>S</given-names> </name><name name-style="western"><surname>Perrotta</surname><given-names>A</given-names> </name><name name-style="western"><surname>Frattolillo</surname><given-names>V</given-names> </name><name name-style="western"><surname>Masino</surname><given-names>M</given-names> </name></person-group><article-title>Comparative evaluation of advanced AI reasoning models in pediatric clinical decision support: ChatGPT O1 vs. 
DeepSeek-R1</article-title><source>medRxiv</source><comment>Preprint posted online on Jan 28, 2025</comment><pub-id pub-id-type="doi">10.1101/2025.01.27.25321169</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Arrieta</surname><given-names>A</given-names> </name><name name-style="western"><surname>Ugarte</surname><given-names>M</given-names> </name><name name-style="western"><surname>Valle</surname><given-names>P</given-names> </name><name name-style="western"><surname>Parejo</surname><given-names>JA</given-names> </name><name name-style="western"><surname>Segura</surname><given-names>S</given-names> </name></person-group><article-title>O3-mini vs DeepSeek-R1: which one is safer?</article-title><source>arXiv</source><comment>Preprint posted online on Jan 30, 2025</comment><pub-id pub-id-type="doi">10.48550/arXiv.2501.18438</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tordjman</surname><given-names>M</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Yuce</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Comparative benchmarking of the DeepSeek large language model on medical tasks and clinical reasoning</article-title><source>Nat Med</source><year>2025</year><month>08</month><volume>31</volume><issue>8</issue><fpage>2550</fpage><lpage>2555</lpage><pub-id pub-id-type="doi">10.1038/s41591-025-03726-3</pub-id><pub-id pub-id-type="medline">40267969</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sandmann</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Hegselmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Fujarski</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Benchmark evaluation of DeepSeek large language models in clinical decision-making</article-title><source>Nat Med</source><year>2025</year><month>08</month><volume>31</volume><issue>8</issue><fpage>2546</fpage><lpage>2549</lpage><pub-id pub-id-type="doi">10.1038/s41591-025-03727-2</pub-id><pub-id pub-id-type="medline">40267970</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yan</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Fan</surname><given-names>KQ</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Comparative analysis of the performance of the large language models DeepSeek-V3, DeepSeek-R1, Open AI-O3 mini and Open AI-O3 mini high in urology</article-title><source>World J Urol</source><year>2025</year><month>07</month><day>7</day><volume>43</volume><issue>1</issue><fpage>416</fpage><pub-id pub-id-type="doi">10.1007/s00345-025-05757-4</pub-id><pub-id pub-id-type="medline">40622427</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ming</surname><given-names>S</given-names> </name><name name-style="western"><surname>Yao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>Q</given-names> </name><etal/></person-group><article-title>Evaluation of DeepSeek-R1 for ophthalmic diagnosis and reasoning: a comparison with OpenAI o1 and o3</article-title><source>J Med 
Syst</source><year>2025</year><month>10</month><day>8</day><volume>49</volume><issue>1</issue><fpage>130</fpage><pub-id pub-id-type="doi">10.1007/s10916-025-02264-2</pub-id><pub-id pub-id-type="medline">41060487</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chai</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>Y</given-names> </name></person-group><article-title>Application of copy number variation sequencing technology in 422 foetuses with abnormal ultrasound soft markers</article-title><source>Int J Womens Health</source><year>2023</year><volume>15</volume><fpage>1791</fpage><lpage>1800</lpage><pub-id pub-id-type="doi">10.2147/IJWH.S429164</pub-id><pub-id pub-id-type="medline">38020944</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><collab>ENSO Working Group</collab></person-group><article-title>Role of prenatal magnetic resonance imaging in fetuses with isolated mild or moderate ventriculomegaly in the era of neurosonography: international multicenter study</article-title><source>Ultrasound Obstet Gynecol</source><year>2020</year><month>09</month><volume>56</volume><issue>3</issue><fpage>340</fpage><lpage>347</lpage><pub-id pub-id-type="doi">10.1002/uog.21974</pub-id><pub-id pub-id-type="medline">31917496</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>G</given-names> </name><etal/></person-group><article-title>A Chinese multicenter retrospective study of isolated increased nuchal translucency associated chromosome anomaly and prenatal diagnostic suggestions</article-title><source>Sci Rep</source><year>2021</year><month>03</month><day>10</day><volume>11</volume><issue>1</issue><fpage>5596</fpage><pub-id pub-id-type="doi">10.1038/s41598-021-85108-6</pub-id><pub-id pub-id-type="medline">33692422</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ji</surname><given-names>X</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Qi</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>When NIPT meets WES, prenatal diagnosticians face the dilemma: genetic etiological analysis of 2,328 cases of NT thickening and follow-up of pregnancy outcomes</article-title><source>Front Genet</source><year>2023</year><volume>14</volume><fpage>1227724</fpage><pub-id pub-id-type="doi">10.3389/fgene.2023.1227724</pub-id><pub-id pub-id-type="medline">37600658</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Fantasia</surname><given-names>I</given-names> </name><name name-style="western"><surname>Catagini</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zamagni</surname><given-names>G</given-names> </name><etal/></person-group><article-title>The clinical impact of the first-trimester nuchal 
translucency between the 95th-99th percentiles</article-title><source>Prenat Diagn</source><year>2023</year><month>06</month><volume>43</volume><issue>7</issue><fpage>929</fpage><lpage>936</lpage><pub-id pub-id-type="doi">10.1002/pd.6390</pub-id><pub-id pub-id-type="medline">37264704</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yoshida</surname><given-names>S</given-names> </name><name name-style="western"><surname>Miura</surname><given-names>K</given-names> </name><name name-style="western"><surname>Yamasaki</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Does increased nuchal translucency indicate a fetal abnormality? A retrospective study to clarify the clinical significance of nuchal translucency in Japan</article-title><source>J Hum Genet</source><year>2008</year><volume>53</volume><issue>8</issue><fpage>688</fpage><lpage>693</lpage><pub-id pub-id-type="doi">10.1007/s10038-008-0299-6</pub-id><pub-id pub-id-type="medline">18500546</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mellis</surname><given-names>R</given-names> </name><name name-style="western"><surname>Eberhardt</surname><given-names>RY</given-names> </name><name name-style="western"><surname>Hamilton</surname><given-names>SJ</given-names> </name><etal/></person-group><article-title>Fetal exome sequencing for isolated increased nuchal translucency: should we be doing it?</article-title><source>BJOG</source><year>2022</year><month>01</month><volume>129</volume><issue>1</issue><fpage>52</fpage><lpage>61</lpage><pub-id pub-id-type="doi">10.1111/1471-0528.16869</pub-id><pub-id pub-id-type="medline">34411415</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation 
citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Zhong</surname><given-names>W</given-names> </name></person-group><article-title>An adaptive &#x201C;fast-slow&#x201D; large language model framework for multi-dimensional classification of prenatal ultrasound reports</article-title><source>Zenodo</source><year>2025</year><access-date>2026-05-13</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://zenodo.org/records/16788862">https://zenodo.org/records/16788862</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>The full anonymized prenatal-ultrasound abnormality dataset, 5 classification schemes with expert scores, DeepSeek large language model prompts, and severity assessment before and after RAG.</p><media xlink:href="jmir_v28i1e91399_app1.xlsx" xlink:title="XLSX File, 105 KB"/></supplementary-material></app-group></back></article>