<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e67469</article-id><article-id pub-id-type="doi">10.2196/67469</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Large Language Models in Randomized Controlled Trials Design: Observational Study</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Jin</surname><given-names>Liyuan</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Ong</surname><given-names>Jasmine Chiat Ling</given-names></name><degrees>PharmD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Elangovan</surname><given-names>Kabilan</given-names></name><degrees>BE</degrees><xref ref-type="aff" 
rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ke</surname><given-names>Yuhe</given-names></name><degrees>MBBS</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff4">4</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Pyle</surname><given-names>Alexandra</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ting</surname><given-names>Daniel Shu Wei</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Liu</surname><given-names>Nan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff6">6</xref></contrib></contrib-group><aff id="aff1"><institution>Duke-NUS Medical School</institution><addr-line>8 College Road</addr-line><addr-line>Singapore</addr-line><country>Singapore</country></aff><aff id="aff2"><institution>Division of Pharmacy, Singapore General Hospital</institution><addr-line>Singapore</addr-line><country>Singapore</country></aff><aff id="aff3"><institution>Artificial Intelligence Office, SingHealth</institution><addr-line>Singapore</addr-line><country>Singapore</country></aff><aff id="aff4"><institution>Department of Anaesthesiology and Perioperative Medicine, Singapore General Hospital</institution><addr-line>Singapore</addr-line><country>Singapore</country></aff><aff id="aff5"><institution>Singapore National Eye Centre</institution><addr-line>Singapore</addr-line><country>Singapore</country></aff><aff id="aff6"><institution>NUS Artificial Intelligence Institute, National University of 
Singapore</institution><addr-line>Singapore</addr-line><country>Singapore</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Dingqiao</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Desai</surname><given-names>Neil</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Nan Liu, PhD, Duke-NUS Medical School, 8 College Road, Singapore, 169857, Singapore, 65 66016503; <email>liu.nan@duke-nus.edu.sg</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>3</day><month>9</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e67469</elocation-id><history><date date-type="received"><day>12</day><month>10</month><year>2024</year></date><date date-type="rev-recd"><day>26</day><month>04</month><year>2025</year></date><date date-type="accepted"><day>28</day><month>04</month><year>2025</year></date></history><copyright-statement>&#x00A9; Liyuan Jin, Jasmine Chiat Ling Ong, Kabilan Elangovan, Yuhe Ke, Alexandra Pyle, Daniel Shu Wei Ting, Nan Liu. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 3.9.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e67469"/><abstract><sec><title>Background</title><p>Randomized controlled trials (RCTs) face challenges such as limited generalizability, insufficient recruitment diversity, and high failure rates, often due to restrictive eligibility criteria and inefficient patient selection. Large language models (LLMs) have shown promise in various clinical tasks, but their potential role in RCT design remains underexplored.</p></sec><sec><title>Objective</title><p>This study investigates the ability of LLMs, specifically GPT-4-Turbo-Preview, to assist in designing RCTs that enhance generalizability, recruitment diversity, and reduce failure rates, while maintaining clinical safety and ethical standards.</p></sec><sec sec-type="methods"><title>Methods</title><p>We conducted a noninterventional, observational study analyzing 20 parallel-arm RCTs, comprising 10 completed and 10 registered studies published after January 2024 to mitigate pretraining biases. 
The LLM was tasked with generating RCT designs based on input criteria, including eligibility, recruitment strategies, interventions, and outcomes. The accuracy of LLM-generated designs was quantitatively assessed by 2 independent clinical experts by comparing them to clinically validated ground truth data from ClinicalTrials.gov. We have conducted statistical analysis using natural language processing&#x2013;based methods, including Bilingual Evaluation Understudy (BLEU), Recall-Oriented Understudy for Gisting Evaluation (ROUGE)-L, and Metric for Evaluation of Translation with Explicit ORdering (METEOR), for objective scoring on corresponding LLM outputs. Qualitative assessments were performed using Likert scale ratings (1-3) for domains such as safety, clinical accuracy, objectivity or bias, pragmatism, inclusivity, and diversity.</p></sec><sec sec-type="results"><title>Results</title><p>The LLM achieved an overall accuracy of 72% in replicating RCT designs. Recruitment and intervention designs demonstrated high agreement with the ground truth, achieving 88% and 93% accuracy, respectively. However, LLMs showed lower accuracy in designing eligibility criteria (55%) and outcomes measurement (53%). Natural language processing statistical analysis reported BLEU=0.04, ROUGE-L=0.20, and METEOR=0.18 on average objective scoring of LLM outputs. Qualitative evaluations showed that LLM-generated designs scored above 2 points and closely matched the original designs in scores across all domains, indicating strong clinical alignment. Specifically, both original and LLM-based designs ranked similarly high in safety, clinical accuracy, and objectivity or bias in published RCTs. Moreover, LLM-based design ranked noninferior to original designs in registered RCTs in multiple domains. 
In particular, LLMs enhanced diversity and pragmatism, which are key factors in improving RCT generalizability and addressing failure rates.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>LLMs, such as GPT-4-Turbo-Preview, have demonstrated potential in improving RCT design, particularly in recruitment and intervention planning, while enhancing generalizability and addressing diversity. However, expert oversight and regulatory measures are essential to ensure patient safety and ethical standards. The findings support further integration of LLMs into clinical trial design, although continued refinement is necessary to address limitations in eligibility and outcomes measurement.</p></sec></abstract><kwd-group><kwd>GPT-4</kwd><kwd>LLM-generated clinical trial designs</kwd><kwd>clinical trial design evaluation</kwd><kwd>recruitment diversity</kwd><kwd>eligibility criteria</kwd><kwd>clinical research ethics</kwd><kwd>trial failure reduction</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Randomized controlled trials (RCTs) serve as the backbone of modern evidence-based clinical practice [<xref ref-type="bibr" rid="ref1">1</xref>]. RCT provides a carefully controlled environment to investigate cause-effect relationships between therapeutic intervention and clinical outcomes with a high degree of internal validity [<xref ref-type="bibr" rid="ref2">2</xref>]. 
Over the years, landmark RCTs have significantly influenced treatment guidelines and improved global standards of care across various medical disciplines [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref5">5</xref>].</p><p>However, despite their scientific rigor in evidence, RCTs face persistent and well-documented criticisms of poor generalizability from fixed eligibility criteria [<xref ref-type="bibr" rid="ref6">6</xref>], lack of diversification in recruitment [<xref ref-type="bibr" rid="ref7">7</xref>], and practical implementation concerns [<xref ref-type="bibr" rid="ref6">6</xref>]. Patients with complex comorbidities or late-stage diseases excluded from phase 3 trials fail to benefit from breakthrough discoveries in real-world practice. Thus, these challenges need to be addressed to maximize the yield of each study.</p><p>In addition to concerns about representativeness, clinical trials face an alarmingly high failure rate, especially in the later stages of development. The high failure rate of clinical trials is a key stumbling block in drug development pipelines. RCT failures have been attributed to various reasons [<xref ref-type="bibr" rid="ref8">8</xref>-<xref ref-type="bibr" rid="ref10">10</xref>], including safety and toxicity concerns, poor accrual and recruitment challenges, logistics, and funding. Of these, a key contributing factor to the failure of phase 3 trials is an inefficient patient selection process [<xref ref-type="bibr" rid="ref11">11</xref>]. Failure of clinical trials bears significant implications for both drug development companies and patients. Clinical research remains the most expensive and time-consuming process of drug development, costing up to a billion dollars in investment and taking more than a decade of work to bring a new drug to market [<xref ref-type="bibr" rid="ref12">12</xref>]. 
Reform of clinical research is much needed to accelerate this process.</p><p>Given the immense time, cost, and effort involved in clinical research, there is an urgent need to reform the RCT design process to address the aforementioned challenges. Emerging technologies, particularly large language models (LLMs), offer a novel opportunity to address these challenges. LLMs have recently emerged as an efficient tool in various clinical tasks [<xref ref-type="bibr" rid="ref13">13</xref>] with comparable clinical alignment to human experts [<xref ref-type="bibr" rid="ref14">14</xref>]. Developments in natural language processing (NLP) empowered LLMs to generate sophisticated and contextually relevant clinical content. Prominent examples, including GPT-4, Gemini, Llama 3, and Claude 3.5, have showcased remarkable versatility and clinical performance in highly specialized clinical tasks [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>]. As a result, LLM tools are expected to assist clinical practice ranging from basic health care&#x2013;related administrative work [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref18">18</xref>], educational chatbots for medical knowledge [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref20">20</xref>], to advanced clinical notes generation [<xref ref-type="bibr" rid="ref21">21</xref>-<xref ref-type="bibr" rid="ref23">23</xref>], complex clinical cases diagnosis [<xref ref-type="bibr" rid="ref24">24</xref>], and patient triaging [<xref ref-type="bibr" rid="ref25">25</xref>,<xref ref-type="bibr" rid="ref26">26</xref>].</p><p>Recently, there has been increasing interest in LLM applications in clinical trials [<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref30">30</xref>]. 
Generative artificial intelligence introduced new paradigms in drug development, from the design and validation of novel pharmaceutical compounds to eligibility screening of patients for clinical trials [<xref ref-type="bibr" rid="ref27">27</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. These approaches show promise in streamlining clinical research but fail to address problems related to trial design and generalizability of RCTs, including eligibility criteria, diversification, and practicability. RCTs provide the highest level of scientific evidence of therapeutic interventions, and their design requires in-depth clinical understanding and rigorous scientific methodologies [<xref ref-type="bibr" rid="ref31">31</xref>-<xref ref-type="bibr" rid="ref33">33</xref>].</p><p>In this study, we explore the application of LLMs as a tool for designing RCTs with clinical alignment and broader applicability. By piloting the use of LLMs in trial design, we aim to assess their potential to enhance the generalizability of study outcomes, optimize eligibility criteria, and ultimately reduce the failure rate of phase 3 clinical trials. This work contributes to the evolving dialogue on the future of clinical research and offers a practical pathway toward more inclusive, efficient, and evidence-driven trial methodologies.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>We performed an observational, noninterventional study using GPT-4-Turbo-Preview as a state-of-the-art LLM for designing RCTs.</p></sec><sec id="s2-2"><title>Validation and Testing Datasets</title><p>We randomly selected 20 parallel-arm RCTs (phase 3 or 4): 10 completed RCTs, with results published in leading clinical journals (JAMA, Nature Medicine, NEJM, and The Lancet); and 10 RCTs registered on ClinicalTrials.gov. 
To mitigate the risk that the LLM had encountered the selected studies during pretraining, we used studies published or newly registered after January 2024 (after the GPT-4-Turbo-Preview pretraining date of December 2023). Details of the dataset are presented in Table S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-3"><title>Reference Standard and LLM Prompt</title><p>We extracted the respective study designs from ClinicalTrials.gov (information cross-checked against publication if available), to serve as our ground truth. We provided the LLM with the following inputs: official titles, brief summaries, study type, study phase, study design, conditions, and intervention or treatment. We then prompted the LLM for the following outputs: eligibility criteria (inclusion and exclusion criteria), recruitment (sex or gender and age), arm or intervention (active and control arms), and outcomes measurement (measurement design and measurement time frame).</p></sec><sec id="s2-4"><title>Large Language Model</title><p>In this study, we selected GPT-4-Turbo-Preview. We chose a temperature of 0.2 to balance replicability and clinical rigor. Detailed prompts and output are presented in Figure S1 and Table S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, respectively.</p></sec><sec id="s2-5"><title>Quantitative Evaluation</title><p>We quantitatively evaluated the accuracy (degree of agreement) of the LLM&#x2019;s outputs by comparing them with the clinically defined ground truth. We first collected ground truth for published studies from the publication (cross-checked against the corresponding study from ClinicalTrials.gov), and for recently registered trials from ClinicalTrials.gov. For outputs with numerical or categorical answers, such as gender or age in recruitment and measurement time frame in outcome measures, we defined correct answers as those completely matching the corresponding values in the ground truth. 
For outputs with clinical answers, such as eligibility criteria, active and control arms in intervention, and measurement design in outcome measures, we defined answers as correct if clinically aligned with the ground truth. Specifically, for eligibility criteria designs, the accuracy was determined by the number of matched LLM designs divided by the total number of eligibility criteria listed by LLM.</p><p>We created a qualitative assessment metric to evaluate both LLM and ground truth designs. This metric comprised safety, clinical accuracy, objectivity (bias), pragmatic (adapted from PRECIS-2 guidance) [<xref ref-type="bibr" rid="ref34">34</xref>], inclusivity, and diversity (adapted from United States Food and Drug Administration [FDA] draft guidance to clinical trial design) [<xref ref-type="bibr" rid="ref7">7</xref>] measured on a 3-point Likert Scale (1 is the worst and 3 is the best). For selected registered RCT studies, we performed a blinded qualitative evaluation without knowledge of ground truth designs to provide a more objective analysis. Mean scores were calculated based on blinded human expert ratings stratified into RCTs (published and registered) with designs (ground truths and LLM designs).</p></sec><sec id="s2-6"><title>Statistical Analysis</title><p>We used average, nonweighted NLP-based objective scoring, including Bilingual Evaluation Understudy (BLEU), Recall-Oriented Understudy for Gisting Evaluation (ROUGE)-L, and Metric for Evaluation of Translation with Explicit ORdering (METEOR) for LLM outputs.</p></sec><sec id="s2-7"><title>Ethical Considerations</title><p>As this study is retrospective in nature and no real patient was involved in the current research, regulatory approval and informed consent are not applicable. 
Human clinical experts (reviewer 1&#x2013;principal clinical pharmacist; reviewer 2&#x2013;specialist physician in anesthesia, both with &#x003E;10 years of clinical practice experience) received no compensation for rating.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>Our results show that LLM demonstrated 72% accuracy in overall RCT designs (stratified performance across different design domains is presented in Figure S2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Specifically, it showed high agreement in Recruitment and Arm or Intervention, with accuracies of 88% and 93%, respectively. However, it demonstrated discrepancies in designing Eligibility Criteria and Outcomes Measurement, with accuracies of 55% and 53%, respectively. We observed marginal differences in accuracy between LLM outputs and both published RCTs and registered RCTs, except for an improvement in exclusion criteria designs in the latest RCTs. We performed statistical analysis of the corresponding LLM outputs using NLP-based methods, including BLEU [<xref ref-type="bibr" rid="ref35">35</xref>], ROUGE-L [<xref ref-type="bibr" rid="ref36">36</xref>], and METEOR [<xref ref-type="bibr" rid="ref37">37</xref>]; the results are presented in Table S3 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. Specifically, BLEU [<xref ref-type="bibr" rid="ref35">35</xref>] measures n-gram precision to evaluate textual similarity, ROUGE-L [<xref ref-type="bibr" rid="ref36">36</xref>] focuses on sequence recall and fluency by identifying the longest common subsequences, and METEOR [<xref ref-type="bibr" rid="ref37">37</xref>] assesses semantic alignment and linguistic variability, incorporating synonyms, stemming, and word order. These metrics collectively provide a comprehensive evaluation of the generated outputs against the reference text. 
Qualitatively, LLM designs produced clinical alignment comparable to the ground truth RCT designs, as observed in closely matched Likert scale ratings, with scores above 2 points across all domains (<xref ref-type="fig" rid="figure1">Figure 1</xref>; grading scores are presented in Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p><p>Our findings suggest that LLM, represented by GPT-4-Turbo-Preview in this study, can replicate RCT designs with reasonable clinical alignment. LLM was able to match RCTs with over 80% accuracy in designing Recruitment requirements and Active or Control Intervention. In the qualitative assessment, we observed marginal differences in the overall clinical accuracy of the LLM design compared with the ground truth, highlighting multiple accepted clinical decisions related to RCT design. Upon qualitative analysis, LLM-based RCT designs closely aligned with documented consensus in the safety, clinical accuracy, and objectivity domains, while showing enhanced diversity and pragmatism. Notably, diversity and pragmatism are key determinants of RCT generalizability and key factors in addressing RCT failure. In addition, LLM could avoid critical safety and ethical issues identified in the ground truth from the analysis of the selected registered RCTs.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>(A) Qualitative metrics for 10 published RCTs. (B) Qualitative metrics for 10 registered RCTs.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e67469_fig01.png"/></fig></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>RCTs serve key roles in clinical practice, and inclusivity has been heavily emphasized by the FDA [<xref ref-type="bibr" rid="ref38">38</xref>] to ensure consistently high-quality design that is scientifically justifiable. 
Current results highlight the potential role of LLM for such an important design principle. Unique attributes of LLM architecture bring distinct advantages over conventional deep learning and NLP in text-based comprehension capabilities. General-purpose LLMs such as GPT-4 can perform tasks with little or no task-specific fine-tuning. Extensive pretraining on medically related free texts sets them apart from conventional machine learning or deep learning models, simulating clinical reasoning and inferential skills across diverse disciplines [<xref ref-type="bibr" rid="ref39">39</xref>], allowing potential integration into sophisticated clinical tasks such as clinical trial design. We infer that LLM could recommend the most commonly used comparator arms for trials of similar nature and discipline and logically deduce active intervention dosage regimens based on preclinical or phase 1 and phase 2 published studies captured in its knowledge corpus.</p><p>Recommended exclusion criteria and outcome measurement time frames differed to a greater extent between LLM-designed trials and the actual published design. These design elements often vary widely across different studies and interventions tested in the real world. Qualitatively, these differences did not significantly compromise overall safety and clinical accuracy. Stronger performance in recruitment and intervention might be partially explained by the fact that LLMs are trained on previous examples of clinical trial designs, with better understanding in predicting sample sizes for inclusion and standard therapeutic intervention regimes. However, inferior performance in eligibility criteria designs and outcomes measurement emphasizes that critical clinical insights are necessary to facilitate clinically relevant clinical trial designs. 
Overall, LLM-based clinical trial designs might benefit more administrative aspects of clinical trial design, such as formulating standard intervention regimes and determining patient sample size, while further improvements are necessary to allow designs for highly specialized clinical trial&#x2013;related domains. Coupled with further tailored RCT designs through prompting with LLMs regarding various patient and condition-related concerns, as well as financial and pragmatic challenges, the current pilot LLM-based RCT framework is expected to improve generalizability, enhance patient recruitment, and reduce RCT failure rates.</p></sec><sec id="s4-2"><title>Limitations</title><p>Our study has the following limitations. First, the generalizability of our findings is constrained by the specific LLM architecture used, GPT-4-Turbo-Preview, which may not reflect the performance of other LLMs or future versions. Although both human reviewers were experienced clinicians, the lack of a broader multidisciplinary review panel may limit the generalizability of the qualitative findings. Future studies could incorporate more diverse expert raters and a certified medical board. Our analysis was limited to text-based outputs, which do not capture the full complexity of clinical trial design, such as availability of funding, ease of patient recruitment, and ethical considerations. The study also relied on a relatively small sample of RCT designs, which may not provide a comprehensive view of the LLMs&#x2019; capabilities across diverse medical specialties. Future studies with larger sample sizes, expanding LLMs of interest for evaluation, and cost-effectiveness analysis stratified by various medical specialties are necessary. Furthermore, for phase 3 and phase 4 trials, substantial work including prior registration and funding would have been published and would affect the interpretation of this study toward the approach of LLM-based RCT designs. 
Future studies on LLM design from the initial hypothesis and direct comparison with concurrent human expert designs are necessary. Finally, alternative trial designs such as open-label, crossover, or pragmatic trials were not considered in this study.</p></sec><sec id="s4-3"><title>Comparison With Prior Work</title><p>To identify relevant studies, we used the following literature search strategy: (&#x201C;clinical trials as topic&#x201D; [MeSH Terms] OR &#x201C;randomized controlled trials as topic&#x201D; [MeSH Terms] OR &#x201C;clinical trial&#x201D; [Title or Abstract]) AND (&#x201C;artificial intelligence&#x201D; [MeSH Terms] OR &#x201C;generative AI&#x201D; [Title or Abstract] OR &#x201C;language model&#x201D; [Title or Abstract]) AND (2022:2024[pdat]). We restricted the search to articles published in PubMed between January 1, 2022, and April 1, 2024. We screened a total of 575 articles from PubMed and included a final total of 6 publications. We included peer-reviewed articles investigating the performance of generative artificial intelligence models applied in the conduct of clinical trials or RCTs. We excluded review papers and studies that did not report any model performance.</p><p>Existing clinical trial&#x2013;related LLM studies, presented in <xref ref-type="table" rid="table1">Table 1</xref>, have only focused on preliminary text classification tasks and are mostly limited to last-generation LLMs, such as Bidirectional Encoder Representations from Transformers (BERT) [<xref ref-type="bibr" rid="ref40">40</xref>]. For instance, performance over eligibility criteria recognition achieved a moderate <italic>F</italic><sub>1</sub>-score over BERT-related LLMs [<xref ref-type="bibr" rid="ref41">41</xref>]. AutoCriteria, leveraging GPT-4 in a zero-shot setting, significantly improved entity extraction across multiple diseases, highlighting the promise of the latest LLMs [<xref ref-type="bibr" rid="ref42">42</xref>]. 
Other efforts include classifying exclusion criteria in cancer trials using BERT, again demonstrating LLM feasibility in clinical tasks [<xref ref-type="bibr" rid="ref43">43</xref>]. GPT-4 has also been explored for sample size calculation, but observed inconsistencies underscore the need for caution in high-stakes applications [<xref ref-type="bibr" rid="ref44">44</xref>]. In addition, predictive modeling of trial publication outcomes using BERT demonstrated the utility of LLM in combining structured and unstructured clinical trial data [<xref ref-type="bibr" rid="ref45">45</xref>]. Given the rapid advancement in LLM development and the accessibility and efficiency of LLMs demonstrated in this study, LLMs hold great promise as assistive tools for RCT design. In our quantitative analysis, LLMs could recommend study designs using gold standard control groups and appropriate active group interventions.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Existing large language model applications in clinical trial&#x2013;related studies.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Studies</td><td align="left" valign="bottom">LLM<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> application</td><td align="left" valign="bottom">LLM<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup> base model</td><td align="left" valign="bottom">Testing dataset sample size</td><td align="left" valign="bottom">Evaluation metrics used</td><td align="left" valign="bottom">Model performance</td></tr></thead><tbody><tr><td align="left" valign="top">A comparative study of pretrained language models for named entity recognition in clinical trial eligibility criteria from multiple corpora [<xref ref-type="bibr" rid="ref41">41</xref>]</td><td align="left" valign="top">Eligibility screening</td><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" 
rid="table1fn2">b</xref></sup></td><td align="left" valign="top">470/230/1000</td><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">0.72/0.84/0.62</td></tr><tr><td align="left" valign="top">AutoCriteria: a generalizable clinical trial eligibility criteria extraction system powered by large language models [<xref ref-type="bibr" rid="ref42">42</xref>]</td><td align="left" valign="top">Eligibility screening</td><td align="left" valign="top">GPT-4<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">180 trials</td><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">0.90</td></tr><tr><td align="left" valign="top">Text classification of cancer clinical trial eligibility criteria [<xref ref-type="bibr" rid="ref43">43</xref>]</td><td align="left" valign="top">Eligibility screening</td><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">764 trials</td><td align="left" valign="top">ACC<sup><xref ref-type="table-fn" rid="table1fn4">d</xref></sup></td><td align="left" valign="top">0.27&#x2010;0.95</td></tr><tr><td align="left" valign="top">ChatGPT for sample size calculation in sports medicine and exercise sciences: a cautionary note [<xref ref-type="bibr" rid="ref44">44</xref>]</td><td align="left" valign="top">Sample size calculation</td><td align="left" valign="top">GPT-4<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></td><td align="left" valign="top">4 trials</td><td align="left" valign="top">ACC</td><td align="left" valign="top">0.75</td></tr><tr><td align="left" valign="top">Medical text classification based on the discriminative pretraining model and prompt-tuning [<xref ref-type="bibr" rid="ref46">46</xref>]</td><td align="left" valign="top">Assist trial outcome measurement</td><td align="left" valign="top">BERT<sup><xref 
ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">5127 outcome entities</td><td align="left" valign="top">ACC</td><td align="left" valign="top">0.86</td></tr><tr><td align="left" valign="top">Predicting publication of clinical trials using structured and unstructured data: model development and validation study [<xref ref-type="bibr" rid="ref45">45</xref>]</td><td align="left" valign="top">Trial outcome prediction</td><td align="left" valign="top">BERT<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td><td align="left" valign="top">76,950 trials</td><td align="left" valign="top"><italic>F</italic><sub>1</sub>-score</td><td align="left" valign="top">0.70</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>LLM: large language model.</p></fn><fn id="table1fn2"><p><sup>b</sup>BERT: Bidirectional Encoder Representations from Transformers.</p></fn><fn id="table1fn3"><p><sup>c</sup>GPT-4: Generative Pre-trained Transformer 4.</p></fn><fn id="table1fn4"><p><sup>d</sup>ACC: accuracy.</p></fn></table-wrap-foot></table-wrap><p>This study contributes significantly to the existing literature by providing empirical data on the accuracy and clinical alignment of LLMs specifically in the context of RCT design. Unlike previous studies, which primarily focus on preliminary text classification tasks, our research applied LLMs to the comprehensive design of RCTs, including elements such as eligibility criteria, recruitment strategies, and intervention arms. Our findings demonstrate that LLMs can replicate existing RCT designs with reasonable accuracy and add value by enhancing the diversity and pragmatism of trial designs. This is crucial in addressing common pitfalls in RCT generalizability and participant diversity. Various factors influence clinical trial accessibility, and a comprehensive, multipronged approach is required. 
Other factors include the lack of education on the benefits of participating in clinical trials, patient trust, and the lack of incentives to participate [<xref ref-type="bibr" rid="ref47">47</xref>]. The design of the clinical trial may inadvertently pose a barrier to entry. Clinical trials often exclude certain populations to a greater extent than others, such as patients with late-stage organ dysfunction.</p><p>Amid the growing interest in the use of LLMs to accelerate clinical trial processes, there is still a paucity of tools developed to improve the overall quality and inclusivity of clinical trials. Our study demonstrated that LLMs are capable of assisting in trial design, encompassing elements of &#x201C;best practices in clinical trial designs.&#x201D; This can serve as a good reference point for nonsubject matter experts, including scientific review committees and ethics boards. Moving forward, the development of LLM-based agentic artificial intelligence workflows could further improve the utility and performance of LLMs in this application. Specialized LLM agents can be developed and incorporated into a multistep &#x201C;checklist&#x201D; approach to perform critical review and evaluation of various domains of a clinical trial design. Multiagent conversations have been shown to improve LLM output accuracy and mitigate cognitive bias [<xref ref-type="bibr" rid="ref48">48</xref>].</p></sec><sec id="s4-4"><title>Conclusions</title><p>This study highlights the potential of LLMs to enhance RCT design, achieving substantial accuracy with key improvements in diversity and pragmatism. Such advancements could significantly improve the efficiency and effectiveness of clinical trials, driving faster development of therapeutic interventions. While LLMs show promise, expert oversight remains crucial for ensuring safety and ethics. 
Future efforts should aim to better integrate LLMs within clinical research frameworks and develop adaptive regulatory measures.</p></sec></sec></body><back><ack><p>This work was supported by the Duke-NUS Signature Research Program, funded by the Ministry of Health, Singapore. The funder had no role in study design, conduct, data analysis, and interpretation. Any opinions, findings, and conclusions or recommendations expressed in this material are those of the authors and do not reflect the views of the Singapore Ministry of Health.</p></ack><notes><sec><title>Data Availability</title><p>Data are supplied in supporting files available for download along with the published manuscript.</p></sec></notes><fn-group><fn fn-type="con"><p>LJ and JCLO contributed equally to this work. DSWT and NL were responsible for conceptualization. LJ, JCLO, and KE carried out the methodology and investigation. JCLO and YK performed the formal analysis and validation. The original draft was written by LJ, JCLO, KE, YK, and AP. LJ, JCLO, KE, and NL reviewed and edited the manuscript. DSWT and NL supervised the project. 
Project administration was carried out by NL.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb2">BLEU</term><def><p>bilingual evaluation understudy</p></def></def-item><def-item><term id="abb3">FDA</term><def><p>US Food and Drug Administration</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">METEOR</term><def><p>Metric for Evaluation of Translation with Explicit Ordering</p></def></def-item><def-item><term id="abb6">NLP</term><def><p>natural language processing</p></def></def-item><def-item><term id="abb7">RCT</term><def><p>randomized controlled trial</p></def></def-item><def-item><term id="abb8">ROUGE</term><def><p>Recall-Oriented Understudy for Gisting Evaluation</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bothwell</surname><given-names>LE</given-names> </name><name name-style="western"><surname>Podolsky</surname><given-names>SH</given-names> </name></person-group><article-title>The emergence of the randomized, controlled trial</article-title><source>N Engl J Med</source><year>2016</year><month>08</month><day>11</day><volume>375</volume><issue>6</issue><fpage>501</fpage><lpage>504</lpage><pub-id pub-id-type="doi">10.1056/NEJMp1604635</pub-id><pub-id pub-id-type="medline">27509097</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hopewell</surname><given-names>S</given-names> </name><name name-style="western"><surname>Chan</surname><given-names>AW</given-names> </name><name 
name-style="western"><surname>Collins</surname><given-names>GS</given-names> </name><etal/></person-group><article-title>CONSORT 2025 statement: updated guideline for reporting randomised trials</article-title><source>Lancet</source><year>2025</year><month>04</month><day>14</day><volume>405</volume><issue>10489</issue><fpage>1633</fpage><lpage>1640</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(25)00672-5</pub-id><pub-id pub-id-type="medline">40245901</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><article-title>Intensive blood-glucose control with sulphonylureas or insulin compared with conventional treatment and risk of complications in patients with type 2 diabetes (UKPDS 33). UK Prospective Diabetes Study (UKPDS) Group</article-title><source>Lancet</source><year>1998</year><month>09</month><day>12</day><volume>352</volume><issue>9131</issue><fpage>837</fpage><lpage>853</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(98)07019-6</pub-id><pub-id pub-id-type="medline">9742976</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kass</surname><given-names>MA</given-names> </name><name name-style="western"><surname>Heuer</surname><given-names>DK</given-names> </name><name name-style="western"><surname>Higginbotham</surname><given-names>EJ</given-names> </name><etal/></person-group><article-title>The ocular hypertension treatment study: a randomized trial determines that topical ocular hypotensive medication delays or prevents the onset of primary open-angle glaucoma</article-title><source>Arch Ophthalmol</source><year>2002</year><month>06</month><volume>120</volume><issue>6</issue><fpage>701</fpage><lpage>713</lpage><pub-id pub-id-type="doi">10.1001/archopht.120.6.701</pub-id><pub-id pub-id-type="medline">12049574</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wykoff</surname><given-names>CC</given-names> </name><name name-style="western"><surname>Abreu</surname><given-names>F</given-names> </name><name name-style="western"><surname>Adamis</surname><given-names>AP</given-names> </name><etal/></person-group><article-title>Efficacy, durability, and safety of intravitreal faricimab with extended dosing up to every 16 weeks in patients with diabetic macular oedema (YOSEMITE and RHINE): two randomised, double-masked, phase 3 trials</article-title><source>Lancet</source><year>2022</year><month>02</month><day>19</day><volume>399</volume><issue>10326</issue><fpage>741</fpage><lpage>755</lpage><pub-id pub-id-type="doi">10.1016/S0140-6736(22)00018-6</pub-id><pub-id pub-id-type="medline">35085503</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nichol</surname><given-names>AD</given-names> </name><name name-style="western"><surname>Bailey</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cooper</surname><given-names>DJ</given-names> </name><collab>POLAR</collab><collab>EPO Investigators</collab></person-group><article-title>Challenging issues in randomised controlled trials</article-title><source>Injury</source><year>2010</year><month>07</month><volume>41 Suppl 1</volume><fpage>S20</fpage><lpage>3</lpage><pub-id pub-id-type="doi">10.1016/j.injury.2010.03.033</pub-id><pub-id pub-id-type="medline">20413119</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gray</surname><given-names>DM</given-names>  <suffix>II</suffix></name><name name-style="western"><surname>Nolan</surname><given-names>TS</given-names> </name><name 
name-style="western"><surname>Gregory</surname><given-names>J</given-names> </name><name name-style="western"><surname>Joseph</surname><given-names>JJ</given-names> </name></person-group><article-title>Diversity in clinical trials: an opportunity and imperative for community engagement</article-title><source>Lancet Gastroenterol Hepatol</source><year>2021</year><month>08</month><volume>6</volume><issue>8</issue><fpage>605</fpage><lpage>607</lpage><pub-id pub-id-type="doi">10.1016/S2468-1253(21)00228-4</pub-id><pub-id pub-id-type="medline">34246352</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Stensland</surname><given-names>KD</given-names> </name><name name-style="western"><surname>DePorto</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ryan</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Estimating the rate and reasons of clinical trial failure in urologic oncology</article-title><source>Urol Oncol</source><year>2021</year><month>03</month><volume>39</volume><issue>3</issue><fpage>154</fpage><lpage>160</lpage><pub-id pub-id-type="doi">10.1016/j.urolonc.2020.10.070</pub-id><pub-id pub-id-type="medline">33257221</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wong</surname><given-names>CH</given-names> </name><name name-style="western"><surname>Siah</surname><given-names>KW</given-names> </name><name name-style="western"><surname>Lo</surname><given-names>AW</given-names> </name></person-group><article-title>Estimation of clinical trial success rates and related parameters</article-title><source>Biostatistics</source><year>2019</year><month>04</month><day>1</day><volume>20</volume><issue>2</issue><fpage>273</fpage><lpage>286</lpage><pub-id 
pub-id-type="doi">10.1093/biostatistics/kxx069</pub-id><pub-id pub-id-type="medline">29394327</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hwang</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Carpenter</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lauffenburger</surname><given-names>JC</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>B</given-names> </name><name name-style="western"><surname>Franklin</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Kesselheim</surname><given-names>AS</given-names> </name></person-group><article-title>Failure of investigational drugs in late-stage clinical development and publication of trial results</article-title><source>JAMA Intern Med</source><year>2016</year><month>12</month><day>1</day><volume>176</volume><issue>12</issue><fpage>1826</fpage><lpage>1833</lpage><pub-id pub-id-type="doi">10.1001/jamainternmed.2016.6008</pub-id><pub-id pub-id-type="medline">27723879</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Harrer</surname><given-names>S</given-names> </name><name name-style="western"><surname>Shah</surname><given-names>P</given-names> </name><name name-style="western"><surname>Antony</surname><given-names>B</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>J</given-names> </name></person-group><article-title>Artificial intelligence for clinical trial design</article-title><source>Trends Pharmacol Sci</source><year>2019</year><month>08</month><volume>40</volume><issue>8</issue><fpage>577</fpage><lpage>591</lpage><pub-id pub-id-type="doi">10.1016/j.tips.2019.05.005</pub-id><pub-id 
pub-id-type="medline">31326235</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hutson</surname><given-names>M</given-names> </name></person-group><article-title>How AI is being used to accelerate clinical trials</article-title><source>Nature</source><year>2024</year><month>03</month><volume>627</volume><issue>8003</issue><fpage>S2</fpage><lpage>S5</lpage><pub-id pub-id-type="doi">10.1038/d41586-024-00753-x</pub-id><pub-id pub-id-type="medline">38480968</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSJ</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Gutierrez</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>TF</given-names> </name><name name-style="western"><surname>Ting</surname><given-names>DSW</given-names> </name></person-group><article-title>Large language models in medicine</article-title><source>Nat Med</source><year>2023</year><month>08</month><volume>29</volume><issue>8</issue><fpage>1930</fpage><lpage>1940</lpage><pub-id pub-id-type="doi">10.1038/s41591-023-02448-8</pub-id><pub-id pub-id-type="medline">37460753</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Singhal</surname><given-names>K</given-names> </name><name name-style="western"><surname>Azizi</surname><given-names>S</given-names> </name><name name-style="western"><surname>Tu</surname><given-names>T</given-names> 
</name><etal/></person-group><article-title>Large language models encode clinical knowledge</article-title><source>Nature</source><year>2023</year><month>08</month><volume>620</volume><issue>7972</issue><fpage>172</fpage><lpage>180</lpage><pub-id pub-id-type="doi">10.1038/s41586-023-06291-2</pub-id><pub-id pub-id-type="medline">37438534</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ke</surname><given-names>YH</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>L</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><etal/></person-group><article-title>Retrieval augmented generation for 10 large language models and its generalizability in assessing medical fitness</article-title><source>NPJ Digit Med</source><year>2025</year><month>04</month><day>5</day><volume>8</volume><issue>1</issue><fpage>187</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01519-z</pub-id><pub-id pub-id-type="medline">40185842</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lim</surname><given-names>DYZ</given-names> </name><name name-style="western"><surname>Ke</surname><given-names>YH</given-names> </name><name name-style="western"><surname>Sng</surname><given-names>GGR</given-names> </name><name name-style="western"><surname>Tung</surname><given-names>JYM</given-names> </name><name name-style="western"><surname>Chai</surname><given-names>JX</given-names> </name><name name-style="western"><surname>Abdullah</surname><given-names>HR</given-names> </name></person-group><article-title>Large language models in anaesthesiology: use of ChatGPT for American Society of Anesthesiologists physical status classification</article-title><source>Br J 
Anaesth</source><year>2023</year><month>09</month><volume>131</volume><issue>3</issue><fpage>e73</fpage><lpage>e75</lpage><pub-id pub-id-type="doi">10.1016/j.bja.2023.06.052</pub-id><pub-id pub-id-type="medline">37474421</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Karakas</surname><given-names>C</given-names> </name><name name-style="western"><surname>Brock</surname><given-names>D</given-names> </name><name name-style="western"><surname>Lakhotia</surname><given-names>A</given-names> </name></person-group><article-title>Leveraging ChatGPT in the pediatric neurology clinic: practical considerations for use to improve efficiency and outcomes</article-title><source>Pediatr Neurol</source><year>2023</year><month>11</month><volume>148</volume><fpage>157</fpage><lpage>163</lpage><pub-id pub-id-type="doi">10.1016/j.pediatrneurol.2023.08.035</pub-id><pub-id pub-id-type="medline">37725885</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ong</surname><given-names>JCL</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>MH</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>N</given-names> </name><etal/></person-group><article-title>A scoping review on generative AI and large language models in mitigating medication related harm</article-title><source>NPJ Digit Med</source><year>2025</year><month>03</month><day>28</day><volume>8</volume><issue>1</issue><fpage>182</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01565-7</pub-id><pub-id pub-id-type="medline">40155703</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>W&#x00F3;jcik</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rulkiewicz</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pruszczyk</surname><given-names>P</given-names> </name><name name-style="western"><surname>Lisik</surname><given-names>W</given-names> </name><name name-style="western"><surname>Pobo&#x017C;y</surname><given-names>M</given-names> </name><name name-style="western"><surname>Domienik-Kar&#x0142;owicz</surname><given-names>J</given-names> </name></person-group><article-title>Reshaping medical education: performance of ChatGPT on a PES medical examination</article-title><source>Cardiol J</source><year>2024</year><volume>31</volume><issue>3</issue><fpage>442</fpage><lpage>450</lpage><pub-id pub-id-type="doi">10.5603/cj.97517</pub-id><pub-id pub-id-type="medline">37830257</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name><name name-style="western"><surname>Portugez</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gross</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Advantages and pitfalls in utilizing artificial intelligence for crafting medical examinations: a medical education pilot study with GPT-4</article-title><source>BMC Med Educ</source><year>2023</year><month>10</month><day>17</day><volume>23</volume><issue>1</issue><fpage>772</fpage><pub-id pub-id-type="doi">10.1186/s12909-023-04752-w</pub-id><pub-id pub-id-type="medline">37848913</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Waisberg</surname><given-names>E</given-names> </name><name 
name-style="western"><surname>Ong</surname><given-names>J</given-names> </name><name name-style="western"><surname>Masalkhi</surname><given-names>M</given-names> </name><etal/></person-group><article-title>GPT-4 and ophthalmology operative notes</article-title><source>Ann Biomed Eng</source><year>2023</year><month>11</month><volume>51</volume><issue>11</issue><fpage>2353</fpage><lpage>2355</lpage><pub-id pub-id-type="doi">10.1007/s10439-023-03263-5</pub-id><pub-id pub-id-type="medline">37266720</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sun</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Ong</surname><given-names>H</given-names> </name><name name-style="western"><surname>Kennedy</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Evaluating GPT4 on impressions generation in radiology reports</article-title><source>Radiology</source><year>2023</year><month>06</month><volume>307</volume><issue>5</issue><fpage>e231259</fpage><pub-id pub-id-type="doi">10.1148/radiol.231259</pub-id><pub-id pub-id-type="medline">37367439</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Zhou</surname><given-names>Z</given-names> </name></person-group><article-title>Evaluation of ChatGPT&#x2019;s capabilities in medical report generation</article-title><source>Cureus</source><year>2023</year><month>04</month><volume>15</volume><issue>4</issue><fpage>e37589</fpage><pub-id pub-id-type="doi">10.7759/cureus.37589</pub-id><pub-id pub-id-type="medline">37197105</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Kanjee</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Crowe</surname><given-names>B</given-names> </name><name name-style="western"><surname>Rodman</surname><given-names>A</given-names> </name></person-group><article-title>Accuracy of a generative artificial intelligence model in a complex diagnostic challenge</article-title><source>JAMA</source><year>2023</year><month>07</month><day>3</day><volume>330</volume><issue>1</issue><fpage>78</fpage><lpage>80</lpage><pub-id pub-id-type="doi">10.1001/jama.2023.8288</pub-id><pub-id pub-id-type="medline">37318797</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Waisberg</surname><given-names>E</given-names> </name><name name-style="western"><surname>Ong</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zaman</surname><given-names>N</given-names> </name><etal/></person-group><article-title>GPT-4 for triaging ophthalmic symptoms</article-title><source>Eye (Lond)</source><year>2023</year><month>12</month><volume>37</volume><issue>18</issue><fpage>3874</fpage><lpage>3875</lpage><pub-id pub-id-type="doi">10.1038/s41433-023-02595-9</pub-id><pub-id pub-id-type="medline">37231187</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lim</surname><given-names>G</given-names> </name><name name-style="western"><surname>Elangovan</surname><given-names>K</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>L</given-names> </name></person-group><article-title>Vision language models in ophthalmology</article-title><source>Curr Opin 
Ophthalmol</source><year>2024</year><month>11</month><day>1</day><volume>35</volume><issue>6</issue><fpage>487</fpage><lpage>493</lpage><pub-id pub-id-type="doi">10.1097/ICU.0000000000001089</pub-id><pub-id pub-id-type="medline">39259649</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ghim</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Ahn</surname><given-names>S</given-names> </name></person-group><article-title>Transforming clinical trials: the emerging roles of large language models</article-title><source>Transl Clin Pharmacol</source><year>2023</year><month>09</month><volume>31</volume><issue>3</issue><fpage>131</fpage><lpage>138</lpage><pub-id pub-id-type="doi">10.12793/tcp.2023.31.e16</pub-id><pub-id pub-id-type="medline">37810626</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wong</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Scaling clinical trial matching using large language models: a case study in oncology</article-title><access-date>2025-08-25</access-date><conf-name>Machine Learning for Healthcare Conference</conf-name><conf-date>Aug 11-12, 2023</conf-date><conf-loc>Columbia University</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.mlr.press/v219/wong23a.html">https://proceedings.mlr.press/v219/wong23a.html</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jin</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Floudas</surname><given-names>CS</given-names> </name><etal/></person-group><article-title>Matching patients to clinical trials with large language models</article-title><source>arXiv</source><comment>Preprint posted online on  Jul 27, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2307.15051</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tayebi Arasteh</surname><given-names>S</given-names> </name><name name-style="western"><surname>Han</surname><given-names>T</given-names> </name><name name-style="western"><surname>Lotfinia</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Large language models streamline automated machine learning for clinical studies</article-title><source>Nat Commun</source><year>2024</year><month>02</month><day>21</day><volume>15</volume><issue>1</issue><fpage>1603</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-45879-8</pub-id><pub-id pub-id-type="medline">38383555</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hopewell</surname><given-names>S</given-names> </name><name name-style="western"><surname>Schulz</surname><given-names>KF</given-names> </name><etal/></person-group><article-title>CONSORT 2010 explanation and elaboration: updated guidelines for reporting parallel group randomised trials</article-title><source>BMJ</source><year>2010</year><month>03</month><day>23</day><volume>340</volume><fpage>c869</fpage><pub-id pub-id-type="doi">10.1136/bmj.c869</pub-id><pub-id pub-id-type="medline">20332511</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Schulz</surname><given-names>KF</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name></person-group><article-title>CONSORT 2010 statement: updated guidelines for reporting parallel group randomised trials</article-title><source>J Pharmacol Pharmacother</source><year>2010</year><month>07</month><volume>1</volume><issue>2</issue><fpage>100</fpage><lpage>107</lpage><pub-id pub-id-type="doi">10.4103/0976-500X.72352</pub-id><pub-id pub-id-type="medline">21350618</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chan</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Tetzlaff</surname><given-names>JM</given-names> </name><name name-style="western"><surname>G&#x00F8;tzsche</surname><given-names>PC</given-names> </name><etal/></person-group><article-title>SPIRIT 2013 explanation and elaboration: guidance for protocols of clinical trials</article-title><source>BMJ</source><year>2013</year><month>01</month><day>8</day><volume>346</volume><fpage>e7586</fpage><pub-id pub-id-type="doi">10.1136/bmj.e7586</pub-id><pub-id pub-id-type="medline">23303884</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Loudon</surname><given-names>K</given-names> </name><name name-style="western"><surname>Treweek</surname><given-names>S</given-names> </name><name name-style="western"><surname>Sullivan</surname><given-names>F</given-names> </name><name name-style="western"><surname>Donnan</surname><given-names>P</given-names> </name><name name-style="western"><surname>Thorpe</surname><given-names>KE</given-names> 
</name><name name-style="western"><surname>Zwarenstein</surname><given-names>M</given-names> </name></person-group><article-title>The PRECIS-2 tool: designing trials that are fit for purpose</article-title><source>BMJ</source><year>2015</year><month>05</month><day>8</day><volume>350</volume><fpage>h2147</fpage><pub-id pub-id-type="doi">10.1136/bmj.h2147</pub-id><pub-id pub-id-type="medline">25956159</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Papineni</surname><given-names>K</given-names> </name><name name-style="western"><surname>Roukos</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ward</surname><given-names>T</given-names> </name><name name-style="western"><surname>Zhu</surname><given-names>W</given-names> </name></person-group><article-title>BLEU: a method for automatic evaluation of machine translation</article-title><conf-name>Proceedings of the 40th Annual Meeting on Association for Computational Linguistics</conf-name><conf-date>Jul 6, 2002</conf-date><conf-loc>Philadelphia, PA</conf-loc><pub-id pub-id-type="doi">10.3115/1073083.1073135</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Lin</surname><given-names>CY</given-names> </name></person-group><article-title>ROUGE: a package for automatic evaluation of summaries</article-title><year>2004</year><access-date>2025-08-25</access-date><conf-name>Proceedings of the Workshop on Text Summarization Branches Out</conf-name><conf-loc>Barcelona, Spain</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/W04-1013/">https://aclanthology.org/W04-1013/</ext-link></comment></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="confproc"><person-group 
person-group-type="author"><name name-style="western"><surname>Banerjee</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lavie</surname><given-names>A</given-names> </name></person-group><article-title>METEOR: an automatic metric for MT evaluation with improved correlation with human judgments</article-title><access-date>2025-08-25</access-date><conf-name>Proceedings of the ACL workshop on intrinsic and extrinsic evaluation measures for machine translation and/or summarization</conf-name><conf-date>Jun 2005</conf-date><conf-loc>Ann Arbor, MI</conf-loc><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/W05-0909/">https://aclanthology.org/W05-0909/</ext-link></comment></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="web"><article-title>Evaluating inclusion and exclusion criteria in clinical trials</article-title><year>2020</year><access-date>2025-08-25</access-date><publisher-name>U.S. Food and Drug Administration</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.fda.gov/media/134754/download">https://www.fda.gov/media/134754/download</ext-link></comment></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tay</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Bommasani</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Emergent abilities of large language models</article-title><source>arXiv</source><comment>Preprint posted online on Oct 26, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2206.07682</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name 
name-style="western"><surname>Devlin</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chang</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Toutanova</surname><given-names>K</given-names> </name><etal/></person-group><article-title>BERT: pre-training of deep bidirectional transformers for language understanding</article-title><source>arXiv</source><comment>Preprint posted online on May 24, 2019</comment><pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Ghiasvand</surname><given-names>O</given-names> </name><etal/></person-group><article-title>A comparative study of pre-trained language models for named entity recognition in clinical trial eligibility criteria from multiple corpora</article-title><source>BMC Med Inform Decis Mak</source><year>2022</year><month>09</month><day>6</day><volume>22</volume><issue>Suppl 3</issue><fpage>235</fpage><pub-id pub-id-type="doi">10.1186/s12911-022-01967-7</pub-id><pub-id pub-id-type="medline">36068551</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Datta</surname><given-names>S</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>K</given-names> </name><name name-style="western"><surname>Paek</surname><given-names>H</given-names> </name><etal/></person-group><article-title>AutoCriteria: a generalizable clinical trial eligibility criteria extraction system powered by large language 
models</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>01</month><day>18</day><volume>31</volume><issue>2</issue><fpage>375</fpage><lpage>385</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocad218</pub-id><pub-id pub-id-type="medline">37952206</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Jayaraj</surname><given-names>S</given-names> </name><name name-style="western"><surname>Ludmir</surname><given-names>E</given-names> </name><name name-style="western"><surname>Roberts</surname><given-names>K</given-names> </name></person-group><article-title>Text classification of cancer clinical trial eligibility criteria</article-title><source>AMIA Annu Symp Proc</source><year>2023</year><volume>2023</volume><fpage>1304</fpage><lpage>1313</lpage><pub-id pub-id-type="medline">38222417</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Methnani</surname><given-names>J</given-names> </name><name name-style="western"><surname>Latiri</surname><given-names>I</given-names> </name><name name-style="western"><surname>Dergaa</surname><given-names>I</given-names> </name><name name-style="western"><surname>Chamari</surname><given-names>K</given-names> </name><name name-style="western"><surname>Ben Saad</surname><given-names>H</given-names> </name></person-group><article-title>ChatGPT for sample-size calculation in sports medicine and exercise sciences: a cautionary note</article-title><source>Int J Sports Physiol Perform</source><year>2023</year><month>10</month><day>1</day><volume>18</volume><issue>10</issue><fpage>1219</fpage><lpage>1223</lpage><pub-id pub-id-type="doi">10.1123/ijspp.2023-0109</pub-id><pub-id 
pub-id-type="medline">37536678</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>S</given-names> </name><name name-style="western"><surname>&#x0160;uster</surname><given-names>S</given-names> </name><name name-style="western"><surname>Baldwin</surname><given-names>T</given-names> </name><name name-style="western"><surname>Verspoor</surname><given-names>K</given-names> </name></person-group><article-title>Predicting publication of clinical trials using structured and unstructured data: model development and validation study</article-title><source>J Med Internet Res</source><year>2022</year><month>12</month><day>23</day><volume>24</volume><issue>12</issue><fpage>e38859</fpage><pub-id pub-id-type="doi">10.2196/38859</pub-id><pub-id pub-id-type="medline">36563029</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>F</given-names> </name></person-group><article-title>Medical text classification based on the discriminative pre-training model and prompt-tuning</article-title><source>Digit Health</source><year>2023</year><volume>9</volume><fpage>20552076231193213</fpage><pub-id pub-id-type="doi">10.1177/20552076231193213</pub-id><pub-id pub-id-type="medline">37559830</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation 
citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bodicoat</surname><given-names>DH</given-names> </name><name name-style="western"><surname>Routen</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Willis</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Promoting inclusion in clinical trials&#x2014;a rapid review of the literature and recommendations for action</article-title><source>Trials</source><year>2021</year><month>12</month><day>4</day><volume>22</volume><issue>1</issue><fpage>880</fpage><pub-id pub-id-type="doi">10.1186/s13063-021-05849-7</pub-id><pub-id pub-id-type="medline">34863265</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ke</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lie</surname><given-names>SA</given-names> </name><etal/></person-group><article-title>Mitigating cognitive biases in clinical decision-making through multi-agent conversations using large language models: simulation study</article-title><source>J Med Internet Res</source><year>2024</year><month>11</month><day>19</day><volume>26</volume><fpage>e59439</fpage><pub-id pub-id-type="doi">10.2196/59439</pub-id><pub-id pub-id-type="medline">39561363</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supporting files on study design and evaluations.</p><media xlink:href="jmir_v27i1e67469_app1.pdf" xlink:title="PDF File, 1131 KB"/></supplementary-material></app-group></back></article>