<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e72412</article-id><article-id pub-id-type="doi">10.2196/72412</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Using Large Language Models to Assess the Consistency of Randomized Controlled Trials on AI Interventions With CONSORT-AI: Cross-Sectional Survey</article-title></title-group><contrib-group><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Luo</surname><given-names>Xufei</given-names></name><degrees>MPH, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" equal-contrib="yes"><name name-style="western"><surname>Li</surname><given-names>Zeming</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff6">6</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author" 
equal-contrib="yes"><name name-style="western"><surname>Yang</surname><given-names>Zhenhua</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="fn" rid="equal-contrib1">*</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Bingyi</given-names></name><degrees>MPH</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ma</surname><given-names>Yanfang</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="aff" rid="aff8">8</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Chen</surname><given-names>Fengxian</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff9">9</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Wang</surname><given-names>Qi</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff10">10</xref><xref ref-type="aff" rid="aff11">11</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ge</surname><given-names>Long</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff12">12</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zou</surname><given-names>James</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff13">13</xref><xref ref-type="aff" rid="aff14">14</xref><xref ref-type="aff" rid="aff15">15</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Zhang</surname><given-names>Lu</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff6">6</xref></contrib><contrib contrib-type="author" corresp="yes"><name 
name-style="western"><surname>Chen</surname><given-names>Yaolong</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref><xref ref-type="aff" rid="aff4">4</xref><xref ref-type="aff" rid="aff5">5</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Bian</surname><given-names>Zhaoxiang</given-names></name><degrees>MD, PhD</degrees><xref ref-type="aff" rid="aff7">7</xref><xref ref-type="aff" rid="aff8">8</xref></contrib></contrib-group><aff id="aff1"><institution>Evidence-Based Medicine Center, School of Basic Medical Sciences, Lanzhou University</institution><addr-line>199 Donggang West Road, Chengguan District</addr-line><addr-line>Lanzhou</addr-line><country>China</country></aff><aff id="aff2"><institution>Research Unit of Evidence-Based Evaluation and Guidelines, Chinese Academy of Medical Sciences (2021RU017), School of Basic Medical Sciences, Lanzhou University</institution><addr-line>Lanzhou</addr-line><country>China</country></aff><aff id="aff3"><institution>World Health Organization Collaboration Center for Guideline Implementation and Knowledge Translation</institution><addr-line>Lanzhou</addr-line><country>China</country></aff><aff id="aff4"><institution>Institute of Health Data Science, Lanzhou University</institution><addr-line>Lanzhou</addr-line><country>China</country></aff><aff id="aff5"><institution>Key Laboratory of Evidence-Based Medicine of Gansu Province, Lanzhou University</institution><addr-line>Lanzhou</addr-line><country>China</country></aff><aff id="aff6"><institution>Department of Computer Science, Hong Kong Baptist University</institution><addr-line>Hong Kong</addr-line><country>China (Hong Kong)</country></aff><aff id="aff7"><institution>Vincent V.C. 
Woo Chinese Medicine Clinical Research Institute, School of Chinese Medicine, Hong Kong Baptist University</institution><addr-line>Hong Kong</addr-line><country>China (Hong Kong)</country></aff><aff id="aff8"><institution>Chinese EQUATOR Centre</institution><addr-line>Hong Kong</addr-line><country>China (Hong Kong)</country></aff><aff id="aff9"><institution>School of Information Science &#x0026; Engineering, Lanzhou University</institution><addr-line>Lanzhou</addr-line><country>China</country></aff><aff id="aff10"><institution>School of Nursing, Li Ka Shing Faculty of Medicine, University of Hong Kong</institution><addr-line>Hong Kong</addr-line><country>China (Hong Kong)</country></aff><aff id="aff11"><institution>Department of Health Research Methods, Evidence and Impact, Faculty of Health Sciences, McMaster University</institution><addr-line>Hamilton</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff12"><institution>Department of Health Policy and Management, School of Public Health, Lanzhou University</institution><addr-line>Lanzhou</addr-line><country>China</country></aff><aff id="aff13"><institution>Department of Biomedical Data Science, Stanford University</institution><addr-line>Stanford</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff14"><institution>Department of Electrical Engineering, Stanford University</institution><addr-line>Stanford</addr-line><addr-line>CA</addr-line><country>United States</country></aff><aff id="aff15"><institution>Department of Computer Science, Stanford University</institution><addr-line>Stanford</addr-line><addr-line>CA</addr-line><country>United States</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Hang</surname><given-names>Ching 
Nam</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Menke</surname><given-names>Joe D</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Vengadassalapathy</surname><given-names>Srinivasan</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Yaolong Chen, MD, PhD, Evidence-Based Medicine Center, School of Basic Medical Sciences, Lanzhou University, 199 Donggang West Road, Chengguan District, Lanzhou, 730000, China, 86 13893104140; <email>chevidence@lzu.edu.cn</email></corresp><fn fn-type="equal" id="equal-contrib1"><label>*</label><p>these authors contributed equally</p></fn></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>26</day><month>9</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e72412</elocation-id><history><date date-type="received"><day>09</day><month>02</month><year>2025</year></date><date date-type="rev-recd"><day>20</day><month>08</month><year>2025</year></date><date date-type="accepted"><day>21</day><month>08</month><year>2025</year></date></history><copyright-statement>&#x00A9; Xufei Luo, Zeming Li, Zhenhua Yang, Bingyi Wang, Yanfang Ma, Fengxian Chen, Qi Wang, Long Ge, James Zou, Lu Zhang, Yaolong Chen, Zhaoxiang Bian. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 26.9.2025. 
</copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e72412"/><abstract><sec><title>Background</title><p>Chatbots based on large language models (LLMs) have shown promise in evaluating the consistency of research. Previously, researchers used LLMs to assess if randomized controlled trial (RCT) abstracts adhered to the CONSORT-Abstract guidelines. However, the consistency of artificial intelligence (AI) interventional RCTs aligning with the CONSORT-AI (Consolidated Standards of Reporting Trials-Artificial Intelligence) standards by LLMs remains unclear.</p></sec><sec><title>Objective</title><p>The aim of this study is to identify the consistency of RCTs on AI interventions with CONSORT-AI using chatbots based on LLMs.</p></sec><sec sec-type="methods"><title>Methods</title><p>This cross-sectional study employed 6 LLMs to assess the consistency of RCTs on AI interventions. The sample selection was based on articles published in <italic>JAMA Network Open</italic>, which included a total of 41 RCTs. 
All queries were submitted to LLMs through an application programming interface with a temperature setting of 0 to ensure deterministic responses. One researcher posed the questions to each model, while another independently verified the responses for validity before recording the results. The Overall Consistency Score (OCS), recall, inter-rater reliability, and consistency of contents were analyzed.</p></sec><sec sec-type="results"><title>Results</title><p>We found gpt-4&#x2010;0125-preview has the best average OCS on the basis of the results obtained by <italic>JAMA Network Open</italic> authors and by us (86.5%, 95% CI 82.5%&#x2010;90.5% and 81.6%, 95% CI 77.6%&#x2010;85.6%, respectively), followed by gpt-4&#x2010;1106-preview (80.3%, 95% CI 76.3%&#x2010;84.3% and 78.0%, 95% CI 74.0%&#x2010;82.0%, respectively). The model with the worst average OCS is gpt-3.5-turbo-0125 on the basis of the results obtained by <italic>JAMA Network Open</italic> authors and by us (61.9%, 95% CI 57.9%&#x2010;65.9% and 63.0%, 95% CI 59.0%&#x2010;67.0%, respectively). Among the 11 unique items of CONSORT-AI, Item 2 (&#x201C;State the inclusion and exclusion criteria at the level of the input data&#x201D;) received the poorest overall evaluation across the 6 models, with an average OCS of 48.8%. For other items, those with an average OCS greater than 80% across the 6 models included Items 1, 5, 8, and 9.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>GPT-4 variants demonstrate strong performance in assessing the consistency of RCTs with CONSORT-AI. Nonetheless, refining the prompts could enhance the precision and consistency of the outcomes. While AI tools like GPT-4 variants are valuable, they are not yet fully autonomous in addressing complex and nuanced tasks such as adherence to CONSORT-AI standards. 
Therefore, integrating AI with higher levels of human supervision and expertise will be crucial to ensuring more reliable and efficient evaluations, ultimately advancing the quality of medical research.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>ChatGPT</kwd><kwd>CONSORT-AI</kwd><kwd>large language model</kwd><kwd>randomized controlled trials</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>Transparent and standardized reporting of medical research is crucial for enhancing the quality and scientific integrity of studies [<xref ref-type="bibr" rid="ref1">1</xref>]. Authors routinely adhere to reporting guidelines when drafting their articles to ensure comprehensive reporting. Widely adopted reporting standards encompass the Consolidated Standards of Reporting Trials (CONSORT) for randomized trials [<xref ref-type="bibr" rid="ref2">2</xref>], Preferred Reporting Items for Systematic reviews and Meta-Analyses (PRISMA) for systematic reviews and meta-analyses [<xref ref-type="bibr" rid="ref3">3</xref>], STrengthening the Reporting of OBservational studies in Epidemiology (STROBE) for observational studies [<xref ref-type="bibr" rid="ref4">4</xref>], among others. These guidelines not only guide authors on how to comprehensively report the content of their studies but also serve as benchmarks for evaluating whether the reporting of published articles is complete and adheres to established guidelines. While these evaluations are relatively objective, the process can be quite time-intensive and lacks scalability.</p><p>The CONSORT-AI (Consolidated Standards of Reporting Trials-Artificial Intelligence) [<xref ref-type="bibr" rid="ref5">5</xref>], introduced in 2020, aims to enhance transparency and thoroughness in the documentation of clinical trials involving AI interventions. 
Building upon the original CONSORT 2010 statement [<xref ref-type="bibr" rid="ref2">2</xref>], this guideline extends the standards by mandating the routine inclusion of 11 additional items deemed critical for AI-based interventions. These additions emphasize the need for detailed reporting on AI intervention itself, including user guidelines, required competencies, the context of its application, and the management of its inputs and outputs. Moreover, CONSORT-AI underscores the importance of documenting the interaction between human users and the AI system, as well as a comprehensive analysis of instances where AI fails or errs. This level of detail is crucial for ensuring a clear understanding and evaluation of how the AI intervention is integrated and performs within a clinical setting. We focused on CONSORT-AI because it is the most widely recognized and accepted guideline for reporting AI-based randomized controlled trials (RCTs). Other guidelines were considered but excluded due to their lack of specificity or comprehensiveness for AI interventions. While our results may generalize to other frameworks, further research is needed to confirm this.</p><p>Large language models (LLMs) are advanced AI systems trained on massive corpus data, allowing them to understand and generate human-like text for a wide range of applications, from question-answering and content generation to language translation and evaluation [<xref ref-type="bibr" rid="ref6">6</xref>]. LLMs have been increasingly applied in various fields beyond health care, such as education and social services, demonstrating their versatility and potential for automating complex tasks. For example, recent studies have explored the use of LLMs in educational settings to assist with personalized learning [<xref ref-type="bibr" rid="ref7">7</xref>] and in social services to improve decision-making processes [<xref ref-type="bibr" rid="ref8">8</xref>]. 
These applications highlight the broader relevance of AI-driven evaluations and the potential for LLMs to enhance efficiency and accuracy across multiple domains. Previous studies have suggested that LLMs or similar conversational AI models demonstrate promising performance in evaluating the risk of bias and quality of reporting [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref12">12</xref>]. However, no study has specifically examined the utility of employing LLMs to assess the adherence of RCTs involving AI interventions to the CONSORT-AI reporting guidelines. Given this gap, we conducted this study to investigate the performance of LLMs (GPT and Claude) [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>] in evaluating the reporting quality of AI-related RCTs against the CONSORT-AI criteria. To our knowledge, this is the first study to systematically evaluate the performance of multiple LLMs in assessing the consistency of AI-based RCTs with the CONSORT-AI guidelines. By doing so, we provide a benchmark for the use of LLMs in this context and highlight their potential for automating the evaluation of reporting quality, which could significantly reduce the time and resource burden on human reviewers. Compared to existing tools like CONSORT-NLP, which is rule-based and effective for specific items but limited in adaptability to new guidelines or nuanced AI contexts [<xref ref-type="bibr" rid="ref15">15</xref>], LLMs offer greater flexibility and scalability without requiring extensive retraining. 
Manual adherence checks are time-intensive, often taking hours per paper [<xref ref-type="bibr" rid="ref10">10</xref>], highlighting the need for automated solutions like LLMs to alleviate this burden in real-world research evaluation.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Overview</title><p>In this study, we used a previously published systematic review as the benchmark for comparing the performance of LLMs [<xref ref-type="bibr" rid="ref16">16</xref>]. The selected review evaluated 41 unique RCTs involving AI interventions against 11 key items from the CONSORT-AI reporting guidelines (<xref ref-type="other" rid="box1">Textbox 1</xref>). We selected this review because, firstly, it includes 41 RCTs of AI interventions, which is a relatively large sample size for this emerging field. Secondly, this review provides the evaluation results of each study from the CONSORT-AI guidelines, which serves as a convenient gold standard for evaluating the performance of LLMs. We did not prospectively register this study because it is a survey on LLMs, and whether it is registered or not does not affect the quality of the study or introduce bias. However, to enhance transparency, we have outlined a retrospective protocol including prompt refinement criteria, evaluation thresholds, and adjudication processes (see <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for details). We followed the Strengthening The Reporting Of Cohort Studies in Surgery (STROCSS) guidelines in writing and reporting this study [<xref ref-type="bibr" rid="ref17">17</xref>]. 
For inter-rater reliability, one researcher posed the questions to each model, while another independently verified the responses for validity before recording the results; a third annotator reviewed 20% of the outputs for consistency, yielding a Cohen &#x03BA; of 0.92.</p><boxed-text id="box1"><title>CONSORT-AI (Consolidated Standards of Reporting Trials-Artificial Intelligence) unique checklist.</title><p>1. Explain the intended use of the artificial intelligence (AI) intervention in the context of the clinical pathway, including its purpose and its intended users (eg, healthcare professionals, patients, public).</p><p>2. State the inclusion and exclusion criteria at the level of the input data.</p><p>3. Describe how the AI intervention was integrated into the trial setting, including any onsite or offsite requirements.</p><p>4. State which version of the AI algorithm was used.</p><p>5. Describe how the input data were acquired and selected for the AI intervention.</p><p>6. Describe how poor quality or unavailable input data were assessed and handled.</p><p>7. Specify whether there was human-AI interaction in the handling of the input data, and what level of expertise was required of users.</p><p>8. Specify the output of the AI intervention.</p><p>9. Explain how the AI intervention&#x2019;s outputs contributed to decision-making or other elements of clinical practice.</p><p>10. Describe results of any analysis of performance errors and how errors were identified, where applicable. If such analysis was planned or done, justify why.</p><p>11. 
State whether and how the AI intervention and its code can be accessed, including any restrictions to access or re-use.</p></boxed-text></sec><sec id="s2-2"><title>Large Language Model Chatbots</title><p>We employed GPT-4 variants (gpt-4&#x2010;0125-preview and gpt-4&#x2010;1106-preview), ChatGPT 3.5 (gpt-3.5-turbo-0125 and gpt-3.5-turbo-1106), Claude-3-Opus-20240229, and Claude-3-sonnet-20240229 to evaluate the adherence of RCTs on AI intervention. We selected ChatGPT and Claude models due to their established performance in natural language understanding and generation tasks, particularly in complex domains such as medical research. These models have been widely used and validated in similar evaluative contexts, making them suitable for our study. While other models such as Gemini and LLaMA are also prominent, their inclusion would have increased the complexity of the study without necessarily providing additional insights, given the rapid evolution of LLMs and the focus of our research on well-established models [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>].</p></sec><sec id="s2-3"><title>Prompt Engineering</title><p>Prompt engineering is crucial for generating accurate and concise outputs from LLMs by crafting well-designed instructions (prompts). We initiated the process by drafting an initial prompt based on ChatGPT&#x2019;s prompt engineering guidelines [<xref ref-type="bibr" rid="ref20">20</xref>], encompassing the LLM&#x2019;s role, the task description, response rules, example responses, question descriptions, and the questions themselves. We then repeatedly tested and refined the prompt using GPT-4 variants on five RCTs until the results achieved an average accuracy rate of 85% or higher. The 5 articles used for prompt refinement were excluded from the final evaluation to prevent data leakage and ensure the validity of our results. After multiple rounds of testing, we finalized the prompt for ChatGPT. 
For example, the prompt template for ChatGPT was as follows: &#x201C;You are an expert in evaluating the reporting quality of RCTs involving AI interventions. Based on the CONSORT-AI guidelines, assess whether the following RCT adheres to the specified items...&#x201D;. Subsequently, we adapted this prompt for the Claude model with minor adjustments tailored to its architecture. <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref> presents the final prompts that we used to communicate with ChatGPT and Claude.</p></sec><sec id="s2-4"><title>The Process of Evaluation</title><p>To facilitate the identification and evaluation of relevant information, we converted the 41 PDF documents into editable Word format using online conversion software Smallpdf. Author names, institutions, and references were removed to reduce token usage and focus the LLMs on the core content of the RCTs. While this may have removed some contextual clues, we believe it did not significantly impact the assessment of CONSORT-AI items, as the key information is contained within the main text. No additional structured format was used, and truncation was not required as the input lengths were within the token limits of the models. All queries for each LLM were submitted through an application programming interface with a temperature setting of 0 to ensure deterministic responses. This setting minimizes variability in the output, which is crucial for maintaining consistency and accuracy in the evaluation of RCTs. Although the temperature was set to 0 to minimize variability, LLMs can still exhibit non-deterministic behavior due to factors such as hardware differences or model updates. One researcher posed the questions to each model, while another independently verified the responses for validity before recording the results. Verification involved ensuring that the LLM&#x2019;s response was logically consistent and directly related to the input text. 
No responses were found to be invalid during this process. Each article was queried only once to maintain consistency. All results were documented in Excel for subsequent analysis. The entire querying process was conducted between April 8 and April 10, 2024. The process of the study is demonstrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Flowchart of the study. OCS: Overall Consistency Score.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e72412_fig01.png"/></fig></sec><sec id="s2-5"><title>Data Analysis</title><p>Following the methodology we established, each constituent subgroup was subsequently scored and categorized into one of the two classifications (reported and not reported). We quantified consistency through an OCS, which reflects the proportion of criteria met, calculated as follows: OCS=(Number of items consistent with gold standards/11) &#x00D7; 100%. This yielded both an absolute score out of 11 (number of total items) and a percentage. The meaning of the OCS score is the percentage of items or studies where the results generated by the LLM match the gold standard we set. The OCS is a measure of accuracy, calculated as the proportion of items where the LLM&#x2019;s assessment matches the gold standard. Content consistency, on the other hand, refers to the agreement between the specific excerpts extracted by the LLMs and those identified by human annotators. To further assess the quality of the data, we computed the Recall for each model, defined as the ratio of true positives&#x2014;items correctly identified by the LLM from the full-text publication&#x2014;to the sum of true positives and false positives. 
False positives represent instances where the LLMs generated data without any corresponding information in the full text (ie, &#x201C;hallucinated data&#x201D;) [<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>For inter-rater reliability, Cohen &#x03BA; was employed to gauge the agreement between human responses and those generated by LLMs. The &#x03BA; coefficient interpretation ranges as follows: 0.0-0.20 for slight agreement, 0.21-0.40 for fair agreement, 0.41-0.60 for moderate agreement, 0.61-0.80 for substantial agreement, and 0.81-1.0 for almost perfect agreement [<xref ref-type="bibr" rid="ref22">22</xref>]. Meanwhile, to detect the accuracy of the LLM responses, we analyzed the consistency between the content extracted by GPT-4 variants and the content we extracted ourselves. The content we extracted was also used as one of the gold standards and compared with the standard from <italic>JAMA Network Open</italic>. Content consistency is defined as: (number of consistent items between the two/total number of items) &#x00D7; 100%. All statistical analyses were conducted using R (developed by Posit, version 2023.12.1 Build 402).</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Average Consistency Scores of Different Models</title><p>The results from six different models revealed that GPT-4&#x2010;0125-preview had the highest average OCS, regardless of whether <italic>JAMA Network Open</italic> authors were used as the gold standard or our own evaluations were applied. Specifically, GPT-4&#x2010;0125-preview achieved an average OCS of 86.5% (95% CI 82.5%&#x2010;90.5%) according to <italic>JAMA Network Open</italic> authors, and 81.6% (95% CI 77.6%&#x2010;85.6%) based on our evaluations. Following this, GPT-4&#x2010;1106-preview had an average OCS of 80.3% (95% CI 76.3%&#x2010;84.3%) with <italic>JAMA Network Open</italic> authors as the gold standard, and 78.0% (95% CI 74.0%&#x2010;82.0%) based on our evaluations. 
The model with the worst average OCS is gpt-3.5-turbo-0125 (61.9%, 95% CI 57.9%&#x2010;65.9% and 63.0%, 95% CI 59.0%&#x2010;67.0%, respectively, as obtained by <italic>JAMA Network Open</italic> authors and us). ANOVA testing showed significant differences in OCS across models (<italic>F</italic>=21.48, <italic>P&#x003C;</italic>.001), with post hoc Tukey HSD confirming GPT-4 variants outperformed Claude and GPT-3.5 (<italic>P&#x003C;</italic>.05 for all pairwise comparisons). Specific average OCS values have been included in <xref ref-type="fig" rid="figure2">Figure 2</xref>. The detailed data are shown in <xref ref-type="supplementary-material" rid="app3">Multimedia Appendix 3</xref>.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>The average consistency score of different models. The green bars represent the evaluation results using <italic>JAMA Network Open</italic> authors as the gold standard, while the purple bars represent the results evaluated by us. The error bars represent the 95% CI.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e72412_fig02.png"/></fig></sec><sec id="s3-2"><title>Overall Consistency Score for Each Study</title><p>In the 41 RCTs included, the median OCS assessed by the gpt-4&#x2010;0125-preview was 81.8%, ranging from 54.5% to 100%. gpt-4&#x2010;1106-preview, claude-3-Opus-20240229, claude-3-sonnet-20240229, gpt-3.5-turbo-0125, and gpt-3.5-turbo-1106 were 72.7% (54.5%-100%), 81.8% (45.5%-100%), 81.8% (54.5%-100%), 63.6% (36.4%-81.8%), and 63.6% (36.4%-90.9%), respectively. The results are presented as a histogram in <xref ref-type="fig" rid="figure3">Figure 3</xref>. 
The specific results are shown in <xref ref-type="supplementary-material" rid="app4">Multimedia Appendix 4</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Histograms of Overall Consistency Score (OCS) scores from 41 randomized controlled trials using different models, with the <italic>X</italic>-axis representing OCS scores and the <italic>Y</italic>-axis representing frequencies (the number of studies).</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e72412_fig03.png"/></fig></sec><sec id="s3-3"><title>Overall Consistency Score for Each Item</title><p>Among the 11 unique items of CONSORT-AI, Item 2 (State the inclusion and exclusion criteria at the level of the input data) received the poorest overall evaluation across the 6 models, with an average OCS of 48.8%. For other items, those with an average OCS greater than 80% across the 6 models included Items 1, 5, 8, and 9, as detailed in <xref ref-type="fig" rid="figure4">Figure 4</xref>.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>The overall consistency score for each item. The error bars represent the standard error.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e72412_fig04.png"/></fig></sec><sec id="s3-4"><title>Recall for Each Item</title><p>The recall for each item varied. In CONSORT-AI, Items 1, 2, 3, 5, 7, and 8 showed consistently high recall across the 6 different models, but the remaining 5 items showed significant variability in performance among different models, with overall lower recall, as shown in <xref ref-type="fig" rid="figure5">Figure 5</xref>.</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>The recall for each item. 
The error bars represent the standard error.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e72412_fig05.png"/></fig></sec><sec id="s3-5"><title>Consistency of Content Generated by GPT-4 Variants Compared to Humans</title><p>We reevaluated the included 41 AI intervention RCTs. We found inconsistencies between the content extracted by GPT-4&#x2010;0125-preview and our own extracted content. Consistency was defined as an exact match between the excerpt identified by the LLM and that identified by human annotators. No hallucinated quotes were observed in our analysis. Of the 41 studies, 29 (70.7%) showed inconsistencies; Item 2 had inconsistencies in 15 studies (36.6%), and Item 9 had inconsistencies in 11 studies (26.8%), as detailed in <xref ref-type="supplementary-material" rid="app5">Multimedia Appendix 5</xref>. The agreement between our annotations and those from the systematic review was moderate, with a Cohen &#x03BA; of 0.65, indicating that human evaluations also vary and providing context for the performance of the LLMs. Cohen &#x03BA; values for comparisons between different models and the gold standard are presented in <xref ref-type="supplementary-material" rid="app6">Multimedia Appendix 6</xref>.</p></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>We employed 3 different LLMs (ChatGPT 4, ChatGPT 3.5, and Claude 3) across 6 different versions to evaluate the consistency of 41 AI intervention RCTs with CONSORT-AI. We found GPT-4 variants (gpt-4&#x2010;0125-preview) outperformed the other models, with significant variation in average OCS across different items, ranging from 26.8% to 100%, with an average score of 86.5%. The potential reason might be due to the varying levels of difficulty among different items, leading to different levels of understanding by the LLM tools. 
There is currently no recognized standard for what threshold indicates high consistency, but based on our experience, exceeding 85% suggests very good consistency. In addition, compared to the standard from <italic>JAMA Network Open</italic>, our own evaluation as the gold standard resulted in higher OCS scores in both GPT-4 variants and Claude. This suggests that there may be some variability in evaluation results between different individuals for the same item.</p><p>The highest consistency by GPT-4 variants (gpt-4&#x2010;0125-preview) could be attributed to its more refined understanding and processing capabilities, which are crucial in analyzing complex scientific texts. However, in some items (such as Items 2, 4, and 10) of the CONSORT-AI, all LLM chatbots perform poorly, possibly because these items are relatively complex. For example, in Item 2, GPT-4 misinterpreted data-level criteria as patient-level in one RCT we included, extracting &#x201C;patients aged 18&#x2010;65&#x201D; instead of &#x201C;images with resolution &#x003E;512 px,&#x201D; likely due to prompt ambiguity around &#x201C;input data.&#x201D; Token limits were not an issue (average input &#x003C;10k tokens), but model behavior suggests overfitting to common clinical phrases. The main reason might be that LLMs cannot effectively distinguish what the criteria are at the data level. Another example is Item 4, which requires reporting the version of the AI algorithm. However, many included studies report the time of algorithm development rather than the specific version, which we consider less precise as researchers may not always use the latest version. LLMs, however, cannot recognize this type of response as a report. Despite multiple adjustments and optimizations of the prompts, satisfactory results have not been achieved, indicating the need for future exploration of combining human annotation with LLMs to improve overall evaluation effectiveness. 
Another potential solution to this issue is to remove poorly performing items during checklist testing [<xref ref-type="bibr" rid="ref23">23</xref>].</p></sec><sec id="s4-2"><title>Comparison to Prior Work</title><p>Compared to previous studies, which lacked specific focus on LLMs for evaluating AI RCT consistency with CONSORT-AI, our study provides a novel application of LLMs in this context. A similar study [<xref ref-type="bibr" rid="ref15">15</xref>] conducted in 2020 developed the CONSORT-NLP tool, which used natural language processing methods to generate CONSORT checklists. The study indicated that the accuracy of 28 out of the 37 CONSORT items could reach over 90%. However, the CONSORT-NLP tool is rule-based and does not require retraining for different checklists, but it is limited to specific CONSORT items and may not be easily adaptable to other guidelines. Another study [<xref ref-type="bibr" rid="ref10">10</xref>] used ChatGPT 3.5 to evaluate the adherence of RCT abstracts to the CONSORT-Abstract guidelines. The results demonstrated that ChatGPT can automate the appraisal of medical literature, facilitating the identification of accurately reported research. Additionally, our findings are consistent with those of Woelfle et al [<xref ref-type="bibr" rid="ref24">24</xref>], Jiang et al [<xref ref-type="bibr" rid="ref25">25</xref>], and Jiang et al [<xref ref-type="bibr" rid="ref26">26</xref>], who also explored the use of AI in evaluating reporting quality. In contrast to the previous positive results, a study [<xref ref-type="bibr" rid="ref9">9</xref>] using the RoB 2 tool to evaluate the risk of bias in RCTs indicated that ChatGPT and systematic reviewers only have &#x201C;slight&#x201D; to &#x201C;fair&#x201D; agreement in risk of bias judgments for randomized trials. Currently, ChatGPT is unable to reliably assess the risk of bias in randomized trials. 
More recent work by Ji et al [<xref ref-type="bibr" rid="ref27">27</xref>] on LLM assessment of RoB 2 further supports the potential of LLMs in evaluating research quality. Therefore, the evaluation results differ when using different models for different checklists. It is necessary to optimize and refine the prompts to achieve the best consistency with reporting guidelines for different types of study when using different checklists. The agreement between human annotators was moderate (Cohen &#x03BA;=0.65), which is consistent with the findings from Woelfle et al [<xref ref-type="bibr" rid="ref24">24</xref>] and provides context for interpreting the LLMs&#x2019; performance. Additionally, LLMs are known to generate hallucinated content, which can affect the reliability of their evaluations. To mitigate this, we set the temperature to 0 for deterministic responses and manually verified the responses for validity. Additionally, we calculated recall to assess the extent of hallucinated data, which were found to be minimal in our study.</p></sec><sec id="s4-3"><title>Future Directions</title><p>These findings underscore the need for continuous improvement in the reporting practices of AI intervention RCTs. LLMs like GPT-4 variants can play an important role in automating the evaluation of such checklists, potentially reducing the time and resource burden on human reviewers. The next step will be to optimize and refine the prompts for different reporting checklists recommended by the Enhancing the QUAlity and Transparency Of health Research (EQUATOR) Network based on various LLMs. This will be done to improve authors&#x2019; adherence to reporting guidelines and assist reviewers and journal editors in efficiently and quickly evaluating the completeness of manuscripts. 
To achieve this goal, future studies should focus on developing prompt engineering for different reporting guidelines and LLMs, performing testing and validation to achieve the best performance and outcomes. Given the large number of reporting checklists and the rapid evolution of LLMs, manual prompt refinement may be inefficient. Future research should explore automated or semiautomated methods for generating effective prompts.</p><p>Additionally, our study suggests that when using LLMs to evaluate the CONSORT-AI consistency of AI intervention RCTs, the results for Items 1, 5, 8, and 9 are reliable and recommended for use. However, the results for Item 2 are not reliable, and it is not recommended to use LLMs for this evaluation. The performance of other items may vary depending on different studies. For other types of RCTs, using LLMs to evaluate consistency with CONSORT or its extensions still requires further studies in the future.</p></sec><sec id="s4-4"><title>Strengths and Limitations</title><p>A key strength of this study is the comprehensive use of multiple closed-source LLM chatbots to evaluate a large number of RCTs, providing robust insights into the state of reporting consistency in the field of AI interventions. However, the study has some limitations. We found some inconsistencies between the results evaluated by the original authors and those evaluated by the LLMs. Therefore, we extracted specific contents from 41 RCTs, and the results indicated that some content was incorrectly evaluated by the authors, leading to these inconsistencies. We analyzed the consistency of the content, and it does not affect the overall conclusion. Moreover, the low adherence to certain items is mainly related to the difficulty in understanding the prompts and items, which requires continuous optimization and adjustment. Additionally, we recognize that these models have specific limitations compared to other approaches. 
For instance, although LLMs are valuable for initial content generation and consistency checks, they may lack the specialized decision-making capabilities or real-time adaptability that other models, such as those used in emergency triage settings [<xref ref-type="bibr" rid="ref19">19</xref>], can offer. It will still be necessary to strengthen human supervision and enhance the capabilities of these models to better serve related medical tasks in the future. Additionally, considering the instability of the outputs of LLMs, we used only the results from the first output as the final results but conducted repeated runs on 10 RCTs, finding no variability in outputs due to the temperature setting of 0. Due to the rapid updates of LLMs, it is essential to update the models we use and the prompts in a timely manner to better suit the users. It is worth noting that this study used plain text input from Word documents. If users use PDF format documents for evaluation, the performance may be worse than the results presented in this study. Additionally, we acknowledge that our study relies on specific versions of LLMs (eg, ChatGPT and Claude), and this focus inherently limits the generalizability of the findings to other platforms and newer AI models. This limitation reflects the rapidly evolving nature of LLMs and the challenges of conducting such research within this dynamic landscape. Additionally, the study was not registered, which may limit transparency and reproducibility. Although registration is more common for clinical or observational studies, future research in this area could benefit from registering protocols to enhance methodological rigor. Moreover, although the studies were published before GPT-4&#x2019;s knowledge cut-off, minimizing the risk of data leakage, future studies could benefit from using open-source models to further ensure no prior exposure to the data. 
Lastly, although 41 RCTs represent a relatively large sample for AI-based interventions, the small sample size may limit the generalizability of our findings. Future studies with larger datasets are needed to confirm our results.</p><p>In conclusion, while the use of LLMs demonstrates significant potential for helping in the consistency evaluation of AI intervention RCTs, there is still considerable room for improvement in both the tools and the reporting standards they are designed to assess. Current models have limitations, necessitating transparent versioning, bias detection mechanisms, and robust human oversight. Regulatory safeguards, such as audit trails and standards for AI in research assessment, are essential to prevent premature reliance. This study contributes to the ongoing dialog about the role of AI in enhancing the transparency and reliability of scientific reporting in health care.</p></sec></sec></body><back><ack><p>We would like to thank all the members of the ADVANCED (AI-Driven deVelopment and AssessmeNt of healthCare guidElines and standarDs) group for their assistance in extracting data for this study: Hui Liu (Lanzhou University); Ye Wang (Lanzhou University); Haodong Li (Lanzhou University); Huayu Zhang (Lanzhou University); Di Zhu (Lanzhou University); Yuanyuan Yao (Lanzhou University); Dongrui Peng (Lanzhou University); Honghao Lai (Lanzhou University); Jie Zhang (Lanzhou University); Fan Wang (Children&#x2019;s Hospital of Chongqing Medical University); Minjie Duan (Children&#x2019;s Hospital of Chongqing Medical University); Yueyan Li (Children&#x2019;s Hospital of Chongqing Medical University); Shilin Tang (Children&#x2019;s Hospital of Chongqing Medical University); Hanxiang Liu (Children&#x2019;s Hospital of Chongqing Medical University).</p><p>The project was supported by the Vincent and Lily Woo Foundation and Research Unit of Evidence-Based Evaluation and Guidelines, Chinese Academy of Medical Sciences (2021RU017), School of Basic 
Medical Sciences, Lanzhou University.</p></ack><fn-group><fn fn-type="con"><p>Conceptualization: JZ, LZ, YC, ZB</p><p>Data curation: BW, FC, LG, QW, XL, YM, ZL, ZY</p><p>Formal analysis: XL, ZL, ZY</p><p>Investigation: BW, FC, LG, QW, XL, YM, ZL, ZY</p><p>Methodology: JZ, LZ, YC, ZB</p><p>Supervision: JZ, LZ, YC, ZB</p><p>Visualization: XL, ZL, ZY</p><p>Writing &#x2013; original draft: XL, ZL, ZY</p><p>Writing &#x2013; review &#x0026; editing: JZ, LZ, XL, YC, ZB, ZL, ZY</p><p>JZ (email: jamesz@stanford.edu), YC (email: evidence@lzu.edu.cn) and LZ (email: ericluzhang@hkbu.edu.hk) are co-corresponding authors.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">CONSORT</term><def><p>Consolidated Standards of Reporting Trials</p></def></def-item><def-item><term id="abb3">CONSORT-AI</term><def><p>Consolidated Standards of Reporting Trials-Artificial Intelligence</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">OCS</term><def><p>Overall Consistency Score</p></def></def-item><def-item><term id="abb6">PRISMA</term><def><p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses</p></def></def-item><def-item><term id="abb7">RCT</term><def><p>randomized controlled trial</p></def></def-item><def-item><term id="abb8">STROBE</term><def><p>Strengthening the Reporting of Observational studies in Epidemiology</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Simera</surname><given-names>I</given-names> </name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hirst</surname><given-names>A</given-names> 
</name><name name-style="western"><surname>Hoey</surname><given-names>J</given-names> </name><name name-style="western"><surname>Schulz</surname><given-names>KF</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name></person-group><article-title>Transparent and accurate reporting increases reliability, utility, and impact of your research: reporting guidelines and the EQUATOR Network</article-title><source>BMC Med</source><year>2010</year><month>04</month><day>26</day><volume>8</volume><fpage>24</fpage><pub-id pub-id-type="doi">10.1186/1741-7015-8-24</pub-id><pub-id pub-id-type="medline">20420659</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Hopewell</surname><given-names>S</given-names> </name><name name-style="western"><surname>Schulz</surname><given-names>KF</given-names> </name><etal/></person-group><article-title>CONSORT 2010 explanation and elaboration: updated guidelines for reporting parallel group randomised trials</article-title><source>BMJ</source><year>2010</year><month>03</month><day>23</day><volume>340</volume><fpage>c869</fpage><pub-id pub-id-type="doi">10.1136/bmj.c869</pub-id><pub-id pub-id-type="medline">20332511</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Page</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>McKenzie</surname><given-names>JE</given-names> </name><name name-style="western"><surname>Bossuyt</surname><given-names>PM</given-names> </name><etal/></person-group><article-title>The PRISMA 2020 statement: an updated guideline for reporting systematic 
reviews</article-title><source>BMJ</source><year>2021</year><month>03</month><day>29</day><volume>372</volume><fpage>n71</fpage><pub-id pub-id-type="doi">10.1136/bmj.n71</pub-id><pub-id pub-id-type="medline">33782057</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Vandenbroucke</surname><given-names>JP</given-names> </name><name name-style="western"><surname>von Elm</surname><given-names>E</given-names> </name><name name-style="western"><surname>Altman</surname><given-names>DG</given-names> </name><etal/></person-group><article-title>Strengthening the reporting of observational studies in epidemiology (STROBE): explanation and elaboration</article-title><source>Ann Intern Med</source><year>2007</year><month>10</month><day>16</day><volume>147</volume><issue>8</issue><fpage>W163</fpage><lpage>W194</lpage><pub-id pub-id-type="doi">10.7326/0003-4819-147-8-200710160-00010-w1</pub-id><pub-id pub-id-type="medline">17938389</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>X</given-names> </name><name name-style="western"><surname>Cruz Rivera</surname><given-names>S</given-names> </name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Calvert</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Denniston</surname><given-names>AK</given-names> </name><collab>SPIRIT-AI and CONSORT-AI Working Group</collab></person-group><article-title>Reporting guidelines for clinical trial reports for interventions involving artificial intelligence: the CONSORT-AI extension</article-title><source>Nat 
Med</source><year>2020</year><month>09</month><volume>26</volume><issue>9</issue><fpage>1364</fpage><lpage>1374</lpage><pub-id pub-id-type="doi">10.1038/s41591-020-1034-x</pub-id><pub-id pub-id-type="medline">32908283</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Nazir</surname><given-names>A</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name></person-group><article-title>A comprehensive survey of ChatGPT: advancements, applications, prospects, and challenges</article-title><source>Meta Radiol</source><year>2023</year><month>09</month><volume>1</volume><issue>2</issue><fpage>100022</fpage><pub-id pub-id-type="doi">10.1016/j.metrad.2023.100022</pub-id><pub-id pub-id-type="medline">37901715</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dyachenko</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Humenna</surname><given-names>O</given-names> </name><name name-style="western"><surname>Soloviov</surname><given-names>O</given-names> </name><name name-style="western"><surname>Skarga-Bandurova</surname><given-names>I</given-names> </name><name name-style="western"><surname>Nenkov</surname><given-names>N</given-names> </name></person-group><article-title>LLM services in the management of social communications</article-title><source>Front Artif Intell</source><year>2025</year><volume>8</volume><fpage>1474017</fpage><pub-id pub-id-type="doi">10.3389/frai.2025.1474017</pub-id><pub-id pub-id-type="medline">40151524</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hang</surname><given-names>CN</given-names> 
</name><name name-style="western"><surname>Wei Tan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Yu</surname><given-names>PD</given-names> </name></person-group><article-title>MCQGen: a large language model-driven MCQ generator for personalized learning</article-title><source>IEEE Access</source><year>2024</year><volume>12</volume><fpage>102261</fpage><lpage>102273</lpage><pub-id pub-id-type="doi">10.1109/ACCESS.2024.3420709</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Pitre</surname><given-names>T</given-names> </name><name name-style="western"><surname>Jassal</surname><given-names>T</given-names> </name><name name-style="western"><surname>Talukdar</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Shahab</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ling</surname><given-names>M</given-names> </name><name name-style="western"><surname>Zeraatkar</surname><given-names>D</given-names> </name></person-group><article-title>ChatGPT for assessing risk of bias of randomized trials using the rob 2.0 tool: a methods study</article-title><source>MedRxiv</source><comment>Preprint posted online on  Nov 22, 2023</comment><pub-id pub-id-type="doi">10.1101/2023.11.19.23298727</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roberts</surname><given-names>RH</given-names> </name><name name-style="western"><surname>Ali</surname><given-names>SR</given-names> </name><name name-style="western"><surname>Hutchings</surname><given-names>HA</given-names> </name><name name-style="western"><surname>Dobbs</surname><given-names>TD</given-names> </name><name name-style="western"><surname>Whitaker</surname><given-names>IS</given-names> 
</name></person-group><article-title>Comparative study of ChatGPT and human evaluators on the assessment of medical literature according to recognised reporting standards</article-title><source>BMJ Health Care Inform</source><year>2023</year><month>10</month><volume>30</volume><issue>1</issue><fpage>e100830</fpage><pub-id pub-id-type="doi">10.1136/bmjhci-2023-100830</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Anghelescu</surname><given-names>A</given-names> </name><name name-style="western"><surname>Firan</surname><given-names>FC</given-names> </name><name name-style="western"><surname>Onose</surname><given-names>G</given-names> </name><etal/></person-group><article-title>PRISMA systematic literature review, including with meta-analysis vs. chatbot/GPT (AI) regarding current scientific data on the main effects of the calf blood deproteinized hemoderivative medicine (Actovegin) in ischemic stroke</article-title><source>Biomedicines</source><year>2023</year><month>06</month><day>2</day><volume>11</volume><issue>6</issue><fpage>1623</fpage><pub-id pub-id-type="doi">10.3390/biomedicines11061623</pub-id><pub-id pub-id-type="medline">37371718</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sanmarchi</surname><given-names>F</given-names> </name><name name-style="western"><surname>Bucci</surname><given-names>A</given-names> </name><name name-style="western"><surname>Nuzzolese</surname><given-names>AG</given-names> </name><etal/></person-group><article-title>A step-by-step researcher&#x2019;s guide to the use of an AI-based transformer in epidemiology: an exploratory analysis of ChatGPT using the STROBE checklist for observational studies</article-title><source>Z Gesundh 
Wiss</source><year>2023</year><month>05</month><day>26</day><fpage>1</fpage><lpage>36</lpage><pub-id pub-id-type="doi">10.1007/s10389-023-01936-y</pub-id><pub-id pub-id-type="medline">37361298</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="web"><article-title>OpenAI</article-title><source>ChatGPT</source><year>2023</year><access-date>2025-09-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://openai.com/product/chatgpt">https://openai.com/product/chatgpt</ext-link></comment></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="web"><article-title>The claude 3 model family: opus, sonnet, haiku</article-title><source>Anthropic</source><year>2024</year><access-date>2025-09-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://anthropic.com/reports/claude-3">https://anthropic.com/reports/claude-3</ext-link></comment></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>F</given-names> </name><name name-style="western"><surname>Schilsky</surname><given-names>RL</given-names> </name><name name-style="western"><surname>Page</surname><given-names>D</given-names> </name><etal/></person-group><article-title>Development and validation of a natural language processing tool to generate the CONSORT reporting checklist for randomized clinical trials</article-title><source>JAMA Netw Open</source><year>2020</year><month>10</month><day>1</day><volume>3</volume><issue>10</issue><fpage>e2014661</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2020.14661</pub-id><pub-id pub-id-type="medline">33030549</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Plana</surname><given-names>D</given-names> 
</name><name name-style="western"><surname>Shung</surname><given-names>DL</given-names> </name><name name-style="western"><surname>Grimshaw</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Saraf</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sung</surname><given-names>JJY</given-names> </name><name name-style="western"><surname>Kann</surname><given-names>BH</given-names> </name></person-group><article-title>Randomized clinical trials of machine learning interventions in health care: a systematic review</article-title><source>JAMA Netw Open</source><year>2022</year><month>09</month><day>1</day><volume>5</volume><issue>9</issue><fpage>e2233946</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2022.33946</pub-id><pub-id pub-id-type="medline">36173632</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mathew</surname><given-names>G</given-names> </name><name name-style="western"><surname>Agha</surname><given-names>R</given-names> </name><name name-style="western"><surname>Albrecht</surname><given-names>J</given-names> </name><etal/></person-group><article-title>STROCSS 2021: strengthening the reporting of cohort, cross-sectional and case-control studies in surgery</article-title><source>Int J Surg</source><year>2021</year><month>12</month><volume>96</volume><fpage>106165</fpage><pub-id pub-id-type="doi">10.1016/j.ijsu.2021.106165</pub-id><pub-id pub-id-type="medline">34774726</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Sandmann</surname><given-names>S</given-names> </name><name name-style="western"><surname>Riepenhausen</surname><given-names>S</given-names> </name><name 
name-style="western"><surname>Plagwitz</surname><given-names>L</given-names> </name><name name-style="western"><surname>Varghese</surname><given-names>J</given-names> </name></person-group><article-title>Systematic analysis of ChatGPT, Google search and Llama 2 for clinical decision support tasks</article-title><source>Nat Commun</source><year>2024</year><month>03</month><day>6</day><volume>15</volume><issue>1</issue><fpage>2050</fpage><pub-id pub-id-type="doi">10.1038/s41467-024-46411-8</pub-id><pub-id pub-id-type="medline">38448475</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Carl&#x00E0;</surname><given-names>MM</given-names> </name><name name-style="western"><surname>Gambini</surname><given-names>G</given-names> </name><name name-style="western"><surname>Baldascino</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Exploring AI-chatbots&#x2019; capability to suggest surgical planning in ophthalmology: ChatGPT versus Google Gemini analysis of retinal detachment cases</article-title><source>Br J Ophthalmol</source><year>2024</year><month>09</month><day>20</day><volume>108</volume><issue>10</issue><fpage>1457</fpage><lpage>1469</lpage><pub-id pub-id-type="doi">10.1136/bjo-2023-325143</pub-id><pub-id pub-id-type="medline">38448201</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="web"><article-title>OpenAI</article-title><source>Best practices for prompt engineering with the OpenAI API</source><year>2023</year><access-date>2025-09-11</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-the-openai-api">https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-the-openai-api</ext-link></comment></nlm-citation></ref><ref 
id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Gartlehner</surname><given-names>G</given-names> </name><name name-style="western"><surname>Kahwati</surname><given-names>L</given-names> </name><name name-style="western"><surname>Hilscher</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Data extraction for evidence synthesis using a large language model: A proof-of-concept study</article-title><source>Res Synth Methods</source><year>2024</year><month>07</month><volume>15</volume><issue>4</issue><fpage>576</fpage><lpage>589</lpage><pub-id pub-id-type="doi">10.1002/jrsm.1710</pub-id><pub-id pub-id-type="medline">38432227</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Hu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>H</given-names> </name><name name-style="western"><surname>Wu</surname><given-names>P</given-names> </name><name name-style="western"><surname>He</surname><given-names>H</given-names> </name></person-group><article-title>Kappa coefficient: a popular measure of rater agreement</article-title><source>Shanghai Arch Psychiatry</source><year>2015</year><month>02</month><day>25</day><volume>27</volume><issue>1</issue><fpage>62</fpage><lpage>67</lpage><pub-id pub-id-type="doi">10.11919/j.issn.1002-0829.215010</pub-id><pub-id pub-id-type="medline">25852260</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Wrightson</surname><given-names>JG</given-names> </name><name name-style="western"><surname>Blazey</surname><given-names>P</given-names> 
</name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>KM</given-names> </name><name name-style="western"><surname>Ardern</surname><given-names>CL</given-names> </name></person-group><article-title>GPT for RCTs?: using AI to determine adherence to reporting guidelines</article-title><source>Health Informatics</source><comment>Preprint posted online in 2023</comment><pub-id pub-id-type="doi">10.1101/2023.12.14.23299971</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Woelfle</surname><given-names>T</given-names> </name><name name-style="western"><surname>Hirt</surname><given-names>J</given-names> </name><name name-style="western"><surname>Janiaud</surname><given-names>P</given-names> </name><name name-style="western"><surname>Kappos</surname><given-names>L</given-names> </name><name name-style="western"><surname>Ioannidis</surname><given-names>JPA</given-names> </name><name name-style="western"><surname>Hemkens</surname><given-names>LG</given-names> </name></person-group><article-title>Benchmarking human-AI collaboration for common evidence appraisal tools</article-title><source>J Clin Epidemiol</source><year>2024</year><month>11</month><volume>175</volume><fpage>111533</fpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2024.111533</pub-id><pub-id pub-id-type="medline">39277058</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Vorland</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Ying</surname><given-names>X</given-names> </name><etal/></person-group><article-title>SPIRIT-CONSORT-TM: a corpus 
for assessing transparency of clinical trial protocol and results publications</article-title><source>Sci Data</source><year>2025</year><month>02</month><day>28</day><volume>12</volume><issue>1</issue><fpage>355</fpage><pub-id pub-id-type="doi">10.1038/s41597-025-04629-1</pub-id><pub-id pub-id-type="medline">40021657</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Jiang</surname><given-names>L</given-names> </name><name name-style="western"><surname>Lan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Menke</surname><given-names>JD</given-names> </name><name name-style="western"><surname>Vorland</surname><given-names>CJ</given-names> </name><name name-style="western"><surname>Kilicoglu</surname><given-names>H</given-names> </name></person-group><article-title>Text classification models for assessing the completeness of randomized controlled trial publications based on CONSORT reporting guidelines</article-title><source>Sci Rep</source><year>2024</year><month>09</month><day>17</day><volume>14</volume><issue>1</issue><fpage>21721</fpage><pub-id pub-id-type="doi">10.1038/s41598-024-72130-7</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ji</surname><given-names>C</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>B</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>RoBGuard: enhancing LLMs to assess risk of bias in clinical trial documents</article-title><year>2025</year><access-date>2025-09-22</access-date><conf-name>Proceedings of the 31st International Conference on Computational Linguistics</conf-name><conf-loc>Abu 
Dhabi</conf-loc><fpage>1258</fpage><lpage>1277</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://aclanthology.org/2025.coling-main.84/">https://aclanthology.org/2025.coling-main.84/</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Retrospective protocol for the study.</p><media xlink:href="jmir_v27i1e72412_app1.docx" xlink:title="DOCX File, 21 KB"/></supplementary-material><supplementary-material id="app2"><label>Multimedia Appendix 2</label><p>The prompts of ChatGPT and Claude.</p><media xlink:href="jmir_v27i1e72412_app2.docx" xlink:title="DOCX File, 19 KB"/></supplementary-material><supplementary-material id="app3"><label>Multimedia Appendix 3</label><p>Study adherence to CONSORT-AI guidelines.</p><media xlink:href="jmir_v27i1e72412_app3.xlsx" xlink:title="XLSX File, 23 KB"/></supplementary-material><supplementary-material id="app4"><label>Multimedia Appendix 4</label><p>The overall compliance scores of 41 included RCTs. The error bars represent the standard error.</p><media xlink:href="jmir_v27i1e72412_app4.docx" xlink:title="DOCX File, 18 KB"/></supplementary-material><supplementary-material id="app5"><label>Multimedia Appendix 5</label><p>Consistency of content generated by GPT-4 variants compared to humans (a) by study, (b) by item.</p><media xlink:href="jmir_v27i1e72412_app5.docx" xlink:title="DOCX File, 22 KB"/></supplementary-material><supplementary-material id="app6"><label>Multimedia Appendix 6</label><p>Cohen &#x03BA; values for comparisons between different models and the gold standard.</p><media xlink:href="jmir_v27i1e72412_app6.docx" xlink:title="DOCX File, 17 KB"/></supplementary-material></app-group></back></article>