<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="review-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e81597</article-id><article-id pub-id-type="doi">10.2196/81597</article-id><article-categories><subj-group subj-group-type="heading"><subject>Review</subject></subj-group></article-categories><title-group><article-title>Artificial Intelligence Tools for Automating Evidence Synthesis: Scoping Review</article-title></title-group><contrib-group><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Harasgama</surname><given-names>Sashika</given-names></name><degrees>BSc, MScPH, MD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Pearce</surname><given-names>Helen</given-names></name><degrees>BA</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Appel</surname><given-names>Cameron</given-names></name><degrees>BAc, MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Loftus</surname><given-names>Liam</given-names></name><degrees>BM BCh</degrees><xref ref-type="aff" 
rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Painter</surname><given-names>Helena</given-names></name><degrees>DTM&#x0026;H, BSc, MBChB</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Kuhn</surname><given-names>Isla</given-names></name><degrees>MA, MSc</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Karpusheff</surname><given-names>Justine</given-names></name><degrees>PGCE, MSc, PhD</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ceesay</surname><given-names>Aji</given-names></name><degrees>BA, MSc</degrees><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Ford</surname><given-names>John</given-names></name><degrees>DTM&#x0026;H, MBChB, MSc, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib></contrib-group><aff id="aff1"><institution>Wolfson Institute of Population Health, Queen Mary University of London</institution><addr-line>Whitechapel Campus</addr-line><addr-line>London</addr-line><country>United Kingdom</country></aff><aff id="aff2"><institution>Medical Library, University of Cambridge</institution><addr-line>Cambridge</addr-line><country>United Kingdom</country></aff><aff id="aff3"><institution>The Health Foundation</institution><addr-line>London</addr-line><country>United Kingdom</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Mavragani</surname><given-names>Amaryllis</given-names></name></contrib><contrib contrib-type="editor"><name name-style="western"><surname>Brini</surname><given-names>Stefano</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name 
name-style="western"><surname>Ting</surname><given-names>Eon</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Khalil</surname><given-names>Hanan</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Sashika Harasgama, BSc, MScPH, MD, Wolfson Institute of Population Health, Queen Mary University of London, Whitechapel Campus, London, E1 2AD, United Kingdom, +44 (0)20 7882 5555; <email>sashika.harasgama@gmail.com</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>30</day><month>3</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e81597</elocation-id><history><date date-type="received"><day>01</day><month>08</month><year>2025</year></date><date date-type="rev-recd"><day>27</day><month>10</month><year>2025</year></date><date date-type="accepted"><day>03</day><month>11</month><year>2025</year></date></history><copyright-statement>&#x00A9; Sashika Harasgama, Helen Pearce, Cameron Appel, Liam Loftus, Helena Painter, Isla Kuhn, Justine Karpusheff, Aji Ceesay, John Ford. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 30.3.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e81597"/><abstract><sec><title>Background</title><p>Rapidly and accurately synthesizing large volumes of evidence is a time- and resource-intensive process. Once published, reviews often risk becoming outdated, limiting their usefulness for decision makers. Recent advancements in artificial intelligence (AI) have enabled researchers to automate stages of the evidence synthesis process, from literature searching and screening to data extraction and analysis. As previous reviews on this topic have been published, a significant number of tools have been further developed and evaluated. Furthermore, as generative AI increasingly automates evidence synthesis, understanding how it is studied and applied is crucial, given both its benefits and risks.</p></sec><sec><title>Objective</title><p>This review aimed to map the current landscape of evaluated AI tools used to automate evidence synthesis.</p></sec><sec sec-type="methods"><title>Methods</title><p>Following the Joanna Briggs Institute methodology for scoping reviews, we searched Ovid MEDLINE, Ovid Embase, Scopus, and Web of Science in February 2025 and conducted a gray literature search in April 2025. We included articles published in any language from January 2021 onward. Two reviewers independently screened citations using Rayyan, and data were extracted based on study design and key AI-related technical features.</p></sec><sec sec-type="results"><title>Results</title><p>We identified 7841 unique citations through database searches and 19 records through gray literature searching. A total of 222 articles were included in the review. 
We identified 65 AI tools and 25 open-source models or machine learning (ML) algorithms that automate parts of or the whole evidence synthesis pathway. A total of 54.1% (n=120) of the studies were published in 2024, reflecting a trend toward researching general-purpose large language models (LLMs) for evidence synthesis automation. The most popular tool studied was generative pretrained transformer models, including its conversational interface ChatGPT (n=70, 31.5%). Moreover, 31.1% (n=69) studied tools automated by traditional ML algorithms. No studies compared traditional ML tools to LLM-based tools. In addition, 61.7% (n=137) and 26.1% (n=58) studied AI-assisted automation of title and abstract screening and data extraction, respectively, the 2 most intensive stages and, therefore, amenable to automation. Technical performance outcomes were the most frequently reported, with only 4.1% (n=9) of studies reporting time- or workload-specific outcomes. Few studies pragmatically evaluated AI tools in real-world evidence synthesis settings.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>This review comprehensively captures the broad, evolving suite of AI automation tools available to support evidence synthesis, leveraged by increasingly complex AI approaches that range from traditional ML to LLMs. The notable shift toward studying general-purpose generative AI tools reflects how these technologies are actively transforming evidence synthesis practice. The lack of studies in our review comparing different AI approaches for specific automation stages or evaluating their effectiveness pragmatically represents a significant research gap. Optimal tool selection will likely depend on the review topic and methodology and researcher priorities. 
While they offer potential for reducing workload, ongoing evaluation to mitigate AI bias and to ensure the integrity of reviews is essential for safeguarding evidence-based decision-making.</p></sec></abstract><kwd-group><kwd>artificial intelligence</kwd><kwd>machine learning</kwd><kwd>large language models</kwd><kwd>automation</kwd><kwd>evidence synthesis</kwd><kwd>systematic reviews as a topic</kwd><kwd>ChatGPT</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><p>High-quality evidence synthesis is essential for guiding policy and practice. However, producing such evidence reviews is not only time- and resource-intensive but also challenging to keep up to date due to the volume of studies being published each year. The recent COVID-19 pandemic highlighted the challenges of having accurate, contemporaneous, and rapidly synthesized data, and the opportunities that automation presents in mitigating these [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref2">2</xref>]. Previous studies have estimated that an average systematic review can take approximately 67 weeks to complete [<xref ref-type="bibr" rid="ref3">3</xref>], often too long for decision makers. The evolving language and text capabilities of artificial intelligence (AI) have increased the scope of automation within the evidence review process, with studies showing that automated tools can help complete systematic reviews in days to weeks [<xref ref-type="bibr" rid="ref4">4</xref>,<xref ref-type="bibr" rid="ref5">5</xref>] and significantly reduce workload [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>].</p><p>The advent of evidence synthesis automation was driven by machine learning (ML), the technology that underpins most AI evidence synthesis tools today. 
ML is a subset of AI that focuses on creating algorithmic systems that can learn from data, recognize patterns, and make decisions, often with model characteristics manually selected by humans [<xref ref-type="bibr" rid="ref8">8</xref>]. An AI discipline called natural language processing enables computers to understand, interpret, and work with human language (such as written text) and, when used in conjunction with ML, can automate certain language-related tasks [<xref ref-type="bibr" rid="ref8">8</xref>]. Deep learning is a subset of ML that uses neural networks (computational models that consist of layers of interconnected processing units) to automatically learn more complex patterns from large datasets [<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>Time- and workload-intensive steps within the evidence synthesis pathway, particularly title and abstract screening and data extraction, have been made more efficient through automation tools using machine and deep learning. However, the introduction of generative AI (ie, AI that can autonomously produce text, speech, or other outputs) in the form of commercially available large language models (LLMs), such as ChatGPT (OpenAI) or Claude (Anthropic), has created an opportunity for more creative and complex evidence synthesis automation. Studies are currently trialing LLM-based methods to automate individual stages, such as data extraction, to entire clinical evidence synthesis pipelines [<xref ref-type="bibr" rid="ref9">9</xref>,<xref ref-type="bibr" rid="ref10">10</xref>].</p><p>LLMs are leveraged by deep learning models called transformers, which use &#x2018;attention&#x2019; mechanisms to determine how important each word is in relation to others. They are trained on massive amounts of written data and can capture nuance and context in language more effectively than traditional ML models. 
Another popular automation approach uses transformers called Bidirectional Encoder Representations from Transformers (BERT), which reads inputs bidirectionally and is pretrained on large sets of text data and then fine-tuned to excel at domain-specific tasks. These newer AI tools may outperform current automation tools in some tasks; however, researchers are still exploring their potential for evidence synthesis, and their performance is yet to be thoroughly validated over time [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref12">12</xref>]. In fact, the use of advanced and generative AI approaches is not without contention and legitimate risks, with the research community fearing loss of academic integrity in the process [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref14">14</xref>].</p><p>Reviews published to date on this topic have primarily focused on ML tools and approaches used for automating systematic reviews in particular [<xref ref-type="bibr" rid="ref12">12</xref>,<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>]. For example, Khalil et al [<xref ref-type="bibr" rid="ref15">15</xref>] performed a scoping review of automation tools for systematic reviews up to mid-2021 and found 10 validated tools that all used ML. Jimenez et al [<xref ref-type="bibr" rid="ref16">16</xref>] undertook a mapping review of ML tools to assist with systematic reviews and identified 63 tools at the time of publication in December 2022. Khalil et al [<xref ref-type="bibr" rid="ref18">18</xref>] also conducted a review in 2024 on automation tools for scoping reviews. While these reviews were comprehensive, the rate at which AI technology evolves often means they themselves risk being outdated. 
A more recent review published by Lieberum et al [<xref ref-type="bibr" rid="ref12">12</xref>] focused solely on the use of LLMs for performing systematic reviews and found that in half the included studies, LLMs had promising applications, particularly in screening. However, the authors noted that despite the optimism, LLMs are still not quite ready for direct integration into research practice.</p><p>The rapid development and likely increasing adoption of generative AI tools to automate evidence synthesis and inform policy and practice warrant a more contemporaneous scoping of the literature. It presents significant risks to the quality and methodological rigor of evidence synthesis, despite its opportunities. Therefore, it is essential to understand the breadth and scope of AI use in this advancing field.</p><p>In this scoping review, we aimed to systematically map AI tools available for all types of evidence synthesis, across all its stages, and to describe the current landscape by exploring the underlying automation approaches used by the tools and identifying trends in automated evidence synthesis research.</p></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>Ethical approval was not required for this scoping review, as it involved analysis of previously published studies and did not include primary data collection.</p></sec><sec id="s2-2"><title>Review Design</title><p>We used the Joanna Briggs Institute methodological guidance for scoping reviews [<xref ref-type="bibr" rid="ref19">19</xref>]. Using the &#x201C;PCC&#x201D; mnemonic (population, concept, and context), our review aimed to focus on current AI tools (<italic>P</italic>) used to automate (<italic>C</italic>) the evidence synthesis pathway (<italic>C</italic>). 
The reporting of this paper was guided by the PRISMA-ScR (Preferred Reporting Items for Systematic Reviews and Meta-Analyses extension for Scoping Reviews) checklist [<xref ref-type="bibr" rid="ref20">20</xref>], available in <xref ref-type="supplementary-material" rid="app2">Checklist 1</xref>. Our review was not registered a priori in PROSPERO.</p><p>We aimed to identify studies across steps of the evidence synthesis pathway: (1) searching for the evidence, (2) screening the evidence, (3) extracting the data, (4) assessing the quality of the evidence, (5) analyzing the data, and (6) writing the review. We also aimed to identify which AI methods or techniques were most leveraged for tool development and to determine which parts of the evidence synthesis pathway were the most automated. Our secondary objectives included discussing outcomes typically used to assess performance.</p></sec><sec id="s2-3"><title>Search Strategy</title><p>The search strategy was adapted from 2 prior reviews&#x2019; search strategies [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>] and iteratively refined through an initial search of Ovid MEDLINE, based on analysis of titles, abstracts, and keywords returned as well as cross-referencing with key articles. The final search was executed across four databases: Ovid MEDLINE, Ovid Embase, Scopus, and Web of Science. We searched for studies published from January 1, 2021, to February 14, 2025, with no language restrictions. Details of our search strategy are available in Part S1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. 
Our search strategy combined three groups of search terms: (1) evidence review and systematic review terms; (2) AI methods, including ML, deep learning, and other techniques; and (3) terms reflecting the act of automation, such as &#x201C;support,&#x201D; &#x201C;assist,&#x201D; and &#x201C;perform.&#x201D; We also conducted a gray literature search in April 2025 after data extraction of our included studies using Google&#x2019;s search engine and citation snowballing methods with the tool Litmaps [<xref ref-type="bibr" rid="ref21">21</xref>] to identify further relevant studies from our included studies.</p></sec><sec id="s2-4"><title>Eligibility Criteria</title><p>Full details of the eligibility criteria are displayed in <xref ref-type="other" rid="box1">Textbox 1</xref>. All primary and secondary studies that evaluated the use of AI tools for evidence synthesis in health and care research were included. We did not exclude studies if they did not pertain to health and care research specifically; however, we excluded studies if the AI tool was deemed not transferable to that setting. 
Studies that did not provide sufficient evaluation of the tool, through technical or other performance-related metrics, were also excluded.</p><boxed-text id="box1"><title> Eligibility criteria using the population, intervention, comparison, and outcome (PICO) framework.</title><p><bold>Inclusion criteria</bold></p><list list-type="bullet"><list-item><p>Population and setting</p><list list-type="bullet"><list-item><p>Primary or secondary studies that evaluate the use of artificial intelligence (AI) tools for evidence synthesis in health and care research</p></list-item><list-item><p>Tool developed in any country</p></list-item></list></list-item><list-item><p>Intervention</p><list list-type="bullet"><list-item><p>AI tools defined as having two characteristics: adaptivity and autonomy (based on the Department for Science, Innovation and Technology guidance)</p></list-item></list></list-item><list-item><p>Comparison</p><list list-type="bullet"><list-item><p>If comparison, to manual research methods</p></list-item><list-item><p>If comparison, to other automated tools</p></list-item></list></list-item><list-item><p>Outcome</p><list list-type="bullet"><list-item><p>Any technical or performance-related outcome assessing the AI tool</p></list-item><list-item><p>Would be dependent on evidence synthesis task but could include outcomes such as</p><list list-type="bullet"><list-item><p>Sensitivity, specificity, precision, and area under the curve</p></list-item><list-item><p>Number of relevant studies identified in screening</p></list-item><list-item><p>Quality of data extraction</p></list-item><list-item><p>Accuracy of meta-analysis</p></list-item><list-item><p>Quality and accuracy of writing of evidence synthesis report</p></list-item></list></list-item></list></list-item></list><p><bold>Exclusion criteria</bold></p><list list-type="bullet"><list-item><p>Population or setting</p><list list-type="bullet"><list-item><p>Studies do not provide any evaluation of AI tools (only 
descriptions)</p></list-item></list></list-item><list-item><p>Intervention</p><list list-type="bullet"><list-item><p>Automation tools that are not AI or where it is unclear if it is AI</p></list-item></list></list-item></list></boxed-text><p>We defined AI according to the regulatory definition provided by the UK Government Department of Science, Innovation and Technology&#x2019;s policy paper [<xref ref-type="bibr" rid="ref22">22</xref>]. It defines AI as having two distinct characteristics of adaptivity and autonomy, meaning that AI can continually learn and infer patterns not envisioned by human programmers and can make decisions without the intent or oversight of a human. We excluded studies that discussed automated tools but did not display these characteristics. We also defined evidence synthesis according to the Cochrane definition, which &#x201C;involves combining information from multiple studies investigating the same topic to comprehensively understand their findings&#x201D; [<xref ref-type="bibr" rid="ref23">23</xref>]. No studies were excluded based on comparators or outcomes.</p><p>Citations identified through our search were imported and deduplicated using EndNote [<xref ref-type="bibr" rid="ref24">24</xref>], with Rayyan [<xref ref-type="bibr" rid="ref25">25</xref>] used to identify further duplicates. Using Rayyan for the entire screening process, records were initially screened via title and abstract by 1 reviewer. Full-text articles were then uploaded and screened for eligibility by another reviewer, with queries or discrepancies resolved by discussion with a third researcher.</p></sec><sec id="s2-5"><title>Data Extraction and Synthesis</title><p>Data from included studies were extracted into a Microsoft Excel template. Extracted information included baseline study characteristics, such as authors, year of publication, study type, tool name, and outcomes measured. 
We also extracted specific information regarding the AI method used for tool development, web links to the tools or source code if available, as well as paywall features of the tool if relevant. Consistent with Joanna Briggs Institute scoping review methodology, we did not perform a critical appraisal of the included studies. Findings were thematically synthesized and presented visually, alongside being narratively described with descriptive statistics.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Overview</title><p>A total of 11,226 studies were retrieved from database searches, with 3385 citations removed as duplicates (<xref ref-type="fig" rid="figure1">Figure 1</xref>). A total of 7841 citations were screened by their title and abstract, and 7499 were excluded after being deemed irrelevant. The remaining 342 citations underwent full-text screening, and 208 articles were included based on our eligibility criteria. An additional 19 records were identified through snowballing methods and a gray literature search, with 14 records being included. In total, 222 articles were included in the final review.</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>PRISMA (Preferred Reporting Items for Systematic Reviews and Meta-Analyses) flow diagram illustrating the study selection process, including the number of records identified, screened, assessed for eligibility, and included in the final review. AI: artificial intelligence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e81597_fig01.png"/></fig></sec><sec id="s3-2"><title>Study Characteristics</title><p>We included 206 (92.8%) primary studies and 16 (7.2%) secondary studies, with 2 (0.9%) stating they were explicitly AI assisted in their methods [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref27">27</xref>]. 
Three (1.4%) studies included were written in a language other than English [<xref ref-type="bibr" rid="ref26">26</xref>,<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. A total of 11.7% (26/222) of studies were conference abstracts or posters. We identified 65 (29.3%) distinct AI tools and 25 (11.3%) open-source models or algorithms to automate the evidence synthesis process (Table S3 and S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). Forty-two (18.9%) of the included studies had no specific tool name or data availability for their model.</p><p>Included studies could be divided into three broad categories: (1) methodological studies whereby researchers develop a novel algorithm or model and evaluate its effectiveness for automation; (2) evaluation, diagnostic accuracy, or feasibility studies of existing tools; and (3) comparative studies comparing the performance of AI tools to human researchers or similar tools executing the same task. Ten (4.5%) studies compared the performance of popular general-purpose LLMs, notably ChatGPT, Claude, and Gemini, across various tasks, including screening and writing [<xref ref-type="bibr" rid="ref30">30</xref>-<xref ref-type="bibr" rid="ref40">40</xref>]. No studies compared traditional ML tools to LLM-based tools. There was only 1 (0.5%) study that evaluated an AI tool using a randomized trial study design [<xref ref-type="bibr" rid="ref41">41</xref>].</p><p>A total of 54.1% (120/222) of included studies were published in 2024, followed by 16.7% (37/222) in 2023 and 14% (31/222) in 2022. Traditional ML, as an automation method, has remained relatively stable across the years, consistently supporting screening and extraction tasks (<xref ref-type="fig" rid="figure2">Figure 2</xref>). The use of BERTs has gradually increased since 2021, likely reflecting a growing interest in fine-tuned transformer models for domain-specific tasks. 
General-purpose LLMs saw a steep increase in 2024, likely coinciding with the widespread availability and maturity of models such as GPT-4 and Claude.</p><fig position="float" id="figure2"><label>Figure 2.</label><caption><p>Year-on-year change in the study of three artificial intelligence methods for evidence synthesis automation. BERT: Bidirectional Encoder Representations from Transformers; LLM: large language model; ML: machine learning.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e81597_fig02.png"/></fig></sec><sec id="s3-3"><title>Evidence Synthesis Pathway</title><p>Many studies explored multiple areas of the evidence synthesis pathway, with most studies focusing on title and abstract screening (137/222, 61.7%), followed by data extraction (58/222, 26.1%), then literature searching (42/222, 18.9%; <xref ref-type="fig" rid="figure3">Figure 3</xref>). The proportion of studies per evidence synthesis stage reflects how amenable it is to automation. We categorized all 65 tools by evidence synthesis stage in Table S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><fig position="float" id="figure3"><label>Figure 3.</label><caption><p>Percentage of studies examining the automation of distinct evidence synthesis stages using artificial intelligence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e81597_fig03.png"/></fig><p>In total, 52.7% (117/222) of included studies researched tools leveraged by transformers. Some examples include general-purpose LLMs such as OpenAI&#x2019;s generative pretrained transformer (GPT) models or Mistral&#x2019;s 8&#x00D7;22B, as well as BERT models such as BioBERT or PubMedBERT (pretrained on predominantly biomedical text) that were fine-tuned for specific classification or screening tasks. 
In addition, 31.1% (69/222) evaluated tools underpinned by traditional ML algorithms alone, such as RCT Tagger, Research Screener, and Abstrackr. Some tools used an ensemble of ML and transformer-based methods, such as ASReview.</p></sec><sec id="s3-4"><title>Evolution of AI Methods</title><p>A range of AI approaches were used to automate the evidence synthesis pathway, and these have evolved in computational power or complexity. The schematic in <xref ref-type="fig" rid="figure4">Figure 4</xref> demonstrates two distinct categories of learning: traditional ML using classifier or clustering algorithms and deep learning leveraged by classical neural networks or transformers. Transformers are further divided into their subcategories of BERTs and general-purpose LLMs (ie, not designed specifically for evidence synthesis). Language and text processing abilities of the AI tools increase with complexity and computational power.</p><fig position="float" id="figure4"><label>Figure 4.</label><caption><p>Schematic overview of artificial intelligence methods for evidence synthesis, ranging from traditional machine learning to neural networks and transformer-based large language models. AI: artificial intelligence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e81597_fig04.png"/></fig><p>Both traditional ML and general-purpose LLMs were equally used to automate screening, with a similar distribution noted in data extraction (<xref ref-type="fig" rid="figure5">Figure 5</xref>). Traditional ML approaches being frequently leveraged for these stages likely highlight their ongoing suitability for evidence synthesis. Meanwhile, general-purpose LLMs can be applied across the entire evidence synthesis pathway. 
Generative LLMs are the only AI tool at present capable of producing text and therefore solely represent the automation of writing in evidence synthesis (<xref ref-type="fig" rid="figure5">Figure 5</xref>).</p><fig position="float" id="figure5"><label>Figure 5.</label><caption><p>Visualization of included studies depicting the approximate number of studies by evidence synthesis stage and automation method. AI: artificial intelligence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e81597_fig05.png"/></fig></sec><sec id="s3-5"><title>Distribution of Commonly Studied Tools</title><p>Of 222 studies, 70 (31.5%) used GPT models or ChatGPT. Other most cited tools included Anthropic&#x2019;s Claude (16/222, 7.2%), Rayyan (13/222, 5.9%), ASReview (12/222, 5.4%), Abstrackr (9/222, 4.1%), Google&#x2019;s Gemini (8/222, 3.6%), Covidence (5/222, 2.3%), DistillerSR (6/222, 2.7%), Colandr (3/222, 1.4%), and EPPI Reviewer (2/222, 0.9%). An alphabetized list of tools and their categorization across evidence synthesis tasks is available in Tables S3 and S5 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. A list of all open-source models and algorithms is also available in Table S4 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s3-6"><title>Outcomes</title><p>The most common outcomes measured were those designed to assess the technical performance of AI tools and models. Sensitivity and recall, which are the same measure, were used in 22.5% (50/222) and 23% (51/222) of studies, respectively. 
Other frequently reported metrics included specificity, accuracy, precision, area under the curve or area under the receiver operating characteristic curve, and <italic>F</italic><sub>1</sub>-score.
We identified 65 distinct AI tools, some targeting specific aspects of the evidence synthesis pathway and others covering multiple stages, along with 25 open-source ML algorithms or BERT models designed or fine-tuned for evidence synthesis automation tasks. While published studies have traditionally focused on ML approaches for title and abstract screening and data extraction&#x2014;the stages most readily automated&#x2014;recent years have seen a notable shift toward transformer-based approaches for these same stages. Particularly in the past 2 years, there has been a rapid increase in published articles exploring the use of general-purpose LLMs, such as GPT, for evidence synthesis, despite these models not being developed specifically for it. This is notable given the risks generative AI poses due to its training methods and bias. We found that far fewer studies focused on optimizing the search for relevant studies or the analysis of the data or literature.</p></sec><sec id="s4-2"><title>What the Results Mean</title><p>While transformer-based approaches are expanding the types of complex tasks that can be automated, task-specific ML methods continue to be developed and evaluated for well-defined problems. For example, priority screening using ML-based approaches has been validated thoroughly in the literature [<xref ref-type="bibr" rid="ref43">43</xref>,<xref ref-type="bibr" rid="ref51">51</xref>]. On the basis of the results from our review, a wide range of AI tools are being continually deployed and studied at different points across the evidence synthesis pathway, likely reflecting that no one tool or approach is superior to the other and that there is still room for improvement within evidence synthesis automation. 
While several studies attempted to compare tools with similar underlying AI methods, the lack of studies comparing distinct AI approaches, particularly traditional ML- to LLM-based tools, means we have no clear understanding of their comparative effectiveness.</p><p>It is likely that the choice of AI tools will be shaped by the expertise of the evidence synthesized, as well as the purpose and aim of the review itself. Extensively validated traditional ML tools or BERT models fine-tuned to specific research domains may be preferred by researchers or organizations who have a well-defined evidence synthesis task for automation, due to their reliability. For example, researchers conducting systematic reviews of drug or intervention effectiveness may find several well-evaluated tools at their disposal, as most available ML and BERT-based tools excel at interpreting and extracting data from randomized trials with clear eligibility criteria.</p><p>General-purpose LLMs, on the other hand, offer flexible language understanding and generative capabilities that can assist researchers in rapidly processing large volumes of text, identifying relevant information, and synthesizing preliminary insights across diverse topics. While these tools may allow for more nuanced interactions with complex evidence, they are likely less acceptable to traditional academic standards.</p><p>Most tools identified in this review were designed to optimize systematic reviews, predominantly in the biomedical domain. This is likely because of the very strict methodology that is applied to such reviews, which allows for replicability and, therefore, optimal comparison to the gold standard, manually performed reviews. Limited research explored AI automation in complex, multidisciplinary fields such as public health or social sciences, and few tools addressed nonsystematic review methods, such as narrative, realist, or integrative reviews. 
This disparity likely reflects a difference in data structure; more standardized data found in quantitative reviews is easier to process and is more machine readable than qualitative data, which tends to be heterogeneous and context dependent.</p><p>BERT models continue to be used as a method of automation for screening and data extraction-related tasks, with popular models such as SciBERT and BioBERT being fine-tuned to perform classification tasks for biomedical research questions notably. There is early emerging evidence, however, that general-purpose LLMs are equal if not superior to BERTs at most evidence synthesis tasks [<xref ref-type="bibr" rid="ref37">37</xref>,<xref ref-type="bibr" rid="ref52">52</xref>,<xref ref-type="bibr" rid="ref53">53</xref>]. This potentially indicates to researchers that general-purpose LLMs, even if not designed specifically for evidence synthesis, may be a suitable option for automation if unable to invest the time and resources required to fine-tune BERT models.</p><p>Most studies included in our review used technical outcomes to assess tool performance, rather than directly comparing outcomes between AI tools and human researchers performing the same task. While sensitivity and specificity measures are important, there are implications for feasibility if studies do not assess AI tools pragmatically. Adoption of AI tools is likely going to be higher if there is a clear indication that the tool will save a researcher&#x2019;s time and improve workload for similar outputs.</p></sec><sec id="s4-3"><title>Comparison With Previous Literature</title><p>Similar to the findings in Khalil and Jimenez&#x2019;s reviews [<xref ref-type="bibr" rid="ref15">15</xref>,<xref ref-type="bibr" rid="ref16">16</xref>], we found that the task most amenable to automation within evidence synthesis was title and abstract screening, with 62% of included studies exploring both ML and deep learning tools to automate this stage. 
We also found that most ML-based tools identified previously in both reviews remain active and usable today, with several having significant updates incorporating more advanced AI methods such as EPPI Reviewer, Covidence, and Rayyan. Our review was also uniquely able to identify several LLM-based tools that are built specific for evidence synthesis, such as Elicit, Scite, and Consensus. We also found in our review that general-purpose LLMs were predominantly applied to screening and searching of the literature, with GPT being the most frequently cited general-purpose LLM, consistent with Lieberum&#x2019;s review [<xref ref-type="bibr" rid="ref12">12</xref>].</p><p>Unlike previous reviews, however, which studied ML- and LLM-based tools in isolation, our review demonstrates comparatively the strong shift in the evidence base to studying general-purpose LLMs for evidence synthesis tasks in the last 5 years, particularly for screening, extraction, and, uniquely, writing-related tasks. However, the evidence base for AI-generated or enhanced scientific writing is still evolving and remains a widely debated topic in the research community due to ethical and logistical issues attached to the practice [<xref ref-type="bibr" rid="ref54">54</xref>-<xref ref-type="bibr" rid="ref57">57</xref>]. Most journals at present ask for disclosure of AI assistance in the writing process.</p><p>Our review also uniquely sought to identify comparative studies between different AI approaches, particularly between traditional ML- and LLM-based tools, as previous reviews focused on distinct approaches in isolation. However, we were unable to identify any such studies within our search time frame, highlighting an important gap in the research. 
Recent domain-specific evidence has shown that LLM-based tools may outperform ML tools in evidence synthesis tasks such as screening, although these findings are limited due to a lack of generalizability [<xref ref-type="bibr" rid="ref58">58</xref>].</p></sec><sec id="s4-4"><title>Strengths and Limitations</title><p>There are several strengths to this scoping review. First, it offers a broad and comprehensive overview of the field, including over 200 studies. We clearly mapped AI tool use across the entire evidence synthesis pathway, providing complete lists of tools available for specific or multiple tasks. It also uses previous reviews&#x2019; frameworks to expand on the topic of automation in evidence synthesis, adding consistently to this evidence base. The search strategy was robust, covering multiple academic databases as well as gray literature sources, increasing the likelihood of capturing both peer-reviewed and nontraditional publications.</p><p>However, there are several limitations. Given the rapidly evolving nature of AI in evidence synthesis, it is possible that several recent publications were not captured. There is also temporal bias: general-purpose LLMs have only gained widespread adoption in the past 2 years, meaning fewer high-quality peer-reviewed studies are available compared to evaluations of older AI approaches. In addition, this review included studies that explicitly evaluated AI methods or tools to automate evidence synthesis, thereby excluding studies that may have researched AI without clear or proper disclosure of it. This potentially limited our evidence base and underestimated the actual AI usage in evidence synthesis. Additionally, a lack of granularity in reporting the underlying AI methods or technical approaches restricted the meaningful categorization and comparison of tools. 
Poor reporting of tool development methods also limits generalizability to health and care, particularly when models were trained within specific research domains. Publication bias may also exist, as tools developed in academic settings are more likely to appear in scholarly journals, whereas commercial tools may be underrepresented in the literature. Finally, an inherent limitation of scoping review methodology is the lack of critical appraisal of the evidence base, which limits our understanding of the quality of studies included.</p></sec><sec id="s4-5"><title>Recommendations for Policy and Practice</title><p>Until we can put in place mitigations that are effective in reducing error and bias in AI tools, there will be a need for a human researcher in the loop [<xref ref-type="bibr" rid="ref59">59</xref>]. Therefore, the tools highlighted in this paper should be used as adjuncts to humans, rather than replacements, to maintain high-quality and rigorous reviews.</p><p>Funders and researchers should continue to support efforts to build an evidence base in this field that can provide a feedback loop for developers to improve upon the tools&#x2019; effectiveness, feasibility, and acceptability. In particular, assessing these tools in real-world settings and comparing traditional ML methods to newer LLM-based approaches is pertinent to understanding automation opportunities and challenges. Researchers and funders should foster knowledge sharing on the effective use of emerging AI tools, such as through the International Collaboration of the Automation of Systematic Reviews [<xref ref-type="bibr" rid="ref60">60</xref>]. This can be through training resources, collaborative learning opportunities, and sharing best practices. This will help build collective understanding and capacity across the research community.</p><p>Almost all tools found had paid upgrades or monthly subscriptions. 
The lack of comprehensive tools that are free to use is concerning from both an accessibility and equity angle and is likely to influence how such tools are integrated into research practice.</p><p>Quality standards are needed to ensure ethical tool development, and researchers and stakeholders should adhere to guidelines that stipulate methods to properly and appropriately integrate automation tools into evidence synthesis, such as the recommendations and guidance for Responsible AI in Evidence Synthesis [<xref ref-type="bibr" rid="ref61">61</xref>].</p><p>Decision makers who use AI-assisted evidence syntheses should be aware of the issues around error and bias and should exercise caution when using reviews generated by tools that claim to perform the whole process.</p><p>LLMs also have significant electricity demands due to their scale, data requirements, and computational demands [<xref ref-type="bibr" rid="ref62">62</xref>,<xref ref-type="bibr" rid="ref63">63</xref>]. Estimates suggest that generative AI processing, from data center operation and cooling, could consume over 8% of the United States&#x2019; electricity and 5% of Europe&#x2019;s by 2030 [<xref ref-type="bibr" rid="ref64">64</xref>]. Furthermore, manufacturing generative processing units poses significant environmental impacts, particularly for communities near production like in Taichung, Taiwan [<xref ref-type="bibr" rid="ref64">64</xref>]. While some developers are exploring LLMs using renewable energy, such developments remain in their infancy [<xref ref-type="bibr" rid="ref65">65</xref>].</p><p>In the setting of many organizations and institutions requiring consideration of sustainability in their practice, it is important that we review the environmental implications of LLMs if implemented regularly into evidence synthesis workflows. 
Therefore, we recommend that sustainability research be undertaken around the use of such AI tools, an aspect that is not considered at all in the current evidence base.
It was only used to refine writing for grammar and clarity.</p></ack><notes><sec><title>Funding</title><p>"An overview and evaluation of the use of automated tools in rapid evidence synthesis" project is supported by the Health Foundation, an independent charitable organization working to build a healthier United Kingdom (reference FR-0006738).</p></sec><sec><title>Data Availability</title><p>Data sharing is not applicable to this article as no datasets were generated or analyzed during this study.</p></sec></notes><fn-group><fn fn-type="con"><p>Conceptualization: SH (equal), JF (lead)</p><p>Data curation: SH (equal), H Pearce (equal), CA (supporting), IK (supporting)</p><p>Formal analysis: SH (lead), JF (supporting)</p><p>Funding acquisition: JF</p><p>Investigation: SH (lead), H Pearce (equal), IK (supporting)</p><p>Methodology: SH (lead), JF (equal)</p><p>Project administration: SH (lead), JF (equal)</p><p>Supervision: JF</p><p>Visualization: SH (lead), H Pearce (supporting), JF (supporting)</p><p>Writing &#x2013; original draft: SH (lead)</p><p>Writing &#x2013; review &#x0026; editing: H Pearce (supporting), H Painter (supporting), CA (supporting), LL (supporting), IK (supporting), JK (supporting), AC (supporting), JF (equal)</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item><def-item><term id="abb2">BERT</term><def><p>Bidirectional Encoder Representations from Transformers</p></def></def-item><def-item><term id="abb3">GPT</term><def><p>generative pretrained transformer</p></def></def-item><def-item><term id="abb4">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb5">ML</term><def><p>machine learning</p></def></def-item><def-item><term id="abb6">PRISMA-ScR</term><def><p>Preferred Reporting Items for Systematic Reviews and Meta-Analyses extension for Scoping 
Reviews</p></def></def-item><def-item><term id="abb7">WSS</term><def><p>Work Saved Over Sampling</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khalil</surname><given-names>H</given-names> </name><name name-style="western"><surname>Tamara</surname><given-names>L</given-names> </name><name name-style="western"><surname>Rada</surname><given-names>G</given-names> </name><name name-style="western"><surname>Akl</surname><given-names>EA</given-names> </name></person-group><article-title>Challenges of evidence synthesis during the 2020 COVID pandemic: a scoping review</article-title><source>J Clin Epidemiol</source><year>2022</year><month>02</month><volume>142</volume><fpage>10</fpage><lpage>18</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2021.10.017</pub-id><pub-id pub-id-type="medline">34718121</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tercero-Hidalgo</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Khan</surname><given-names>KS</given-names> </name><name name-style="western"><surname>Bueno-Cavanillas</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Artificial intelligence in COVID-19 evidence syntheses was underutilized, but impactful: a methodological study</article-title><source>J Clin Epidemiol</source><year>2022</year><month>08</month><volume>148</volume><fpage>124</fpage><lpage>134</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2022.04.027</pub-id><pub-id pub-id-type="medline">35513213</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Borah</surname><given-names>R</given-names> </name><name name-style="western"><surname>Brown</surname><given-names>AW</given-names> </name><name name-style="western"><surname>Capers</surname><given-names>PL</given-names> </name><name name-style="western"><surname>Kaiser</surname><given-names>KA</given-names> </name></person-group><article-title>Analysis of the time and workers needed to conduct systematic reviews of medical interventions using data from the PROSPERO registry</article-title><source>BMJ Open</source><year>2017</year><month>02</month><day>27</day><volume>7</volume><issue>2</issue><fpage>e012545</fpage><pub-id pub-id-type="doi">10.1136/bmjopen-2016-012545</pub-id><pub-id pub-id-type="medline">28242767</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clark</surname><given-names>J</given-names> </name><name name-style="western"><surname>Glasziou</surname><given-names>P</given-names> </name><name name-style="western"><surname>Del Mar</surname><given-names>C</given-names> </name><name name-style="western"><surname>Bannach-Brown</surname><given-names>A</given-names> </name><name name-style="western"><surname>Stehlik</surname><given-names>P</given-names> </name><name name-style="western"><surname>Scott</surname><given-names>AM</given-names> </name></person-group><article-title>A full systematic review was completed in 2 weeks using automation tools: a case study</article-title><source>J Clin Epidemiol</source><year>2020</year><month>05</month><volume>121</volume><fpage>81</fpage><lpage>90</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2020.01.008</pub-id><pub-id pub-id-type="medline">32004673</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Clark</surname><given-names>J</given-names> </name><name name-style="western"><surname>McFarlane</surname><given-names>C</given-names> </name><name name-style="western"><surname>Cleo</surname><given-names>G</given-names> </name><name name-style="western"><surname>Ishikawa Ramos</surname><given-names>C</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>S</given-names> </name></person-group><article-title>The impact of systematic review automation tools on methodological quality and time taken to complete systematic review tasks: case study</article-title><source>JMIR Med Educ</source><year>2021</year><month>05</month><day>31</day><volume>7</volume><issue>2</issue><fpage>e24418</fpage><pub-id pub-id-type="doi">10.2196/24418</pub-id><pub-id pub-id-type="medline">34057072</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abogunrin</surname><given-names>S</given-names> </name><name name-style="western"><surname>Muir</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Zerbini</surname><given-names>C</given-names> </name><name name-style="western"><surname>Sarri</surname><given-names>G</given-names> </name></person-group><article-title>How much can we save by applying artificial intelligence in evidence synthesis? 
Results from a pragmatic review to quantify workload efficiencies and cost savings</article-title><source>Front Pharmacol</source><year>2025</year><volume>16</volume><fpage>1454245</fpage><pub-id pub-id-type="doi">10.3389/fphar.2025.1454245</pub-id><pub-id pub-id-type="medline">39959426</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rogers</surname><given-names>K</given-names> </name><name name-style="western"><surname>Miller</surname><given-names>A</given-names> </name><name name-style="western"><surname>Girgis</surname><given-names>A</given-names> </name><name name-style="western"><surname>Clark</surname><given-names>EC</given-names> </name><name name-style="western"><surname>Neil-Sztramko</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Dobbins</surname><given-names>M</given-names> </name></person-group><article-title>Leveraging AI to optimize maintenance of health evidence and offer a one-stop shop for quality-appraised evidence syntheses on the effectiveness of public health interventions: quality improvement project</article-title><source>J Med Internet Res</source><year>2025</year><month>07</month><day>29</day><volume>27</volume><issue>1</issue><fpage>e69700</fpage><pub-id pub-id-type="doi">10.2196/69700</pub-id><pub-id pub-id-type="medline">40729661</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ofori-Boateng</surname><given-names>R</given-names> </name><name name-style="western"><surname>Aceves-Martins</surname><given-names>M</given-names> </name><name name-style="western"><surname>Wiratunga</surname><given-names>N</given-names> </name><name name-style="western"><surname>Moreno-Garcia</surname><given-names>CF</given-names> </name></person-group><article-title>Towards the 
automation of systematic reviews using natural language processing, machine learning, and deep learning: a comprehensive review</article-title><source>Artif Intell Rev</source><year>2024</year><volume>57</volume><issue>8</issue><fpage>200</fpage><pub-id pub-id-type="doi">10.1007/s10462-024-10844-w</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Lai</surname><given-names>H</given-names> </name><name name-style="western"><surname>Zhao</surname><given-names>W</given-names> </name><etal/></person-group><article-title>AI-driven evidence synthesis: data extraction of randomized controlled trials with large language models</article-title><source>Int J Surg</source><year>2025</year><volume>111</volume><issue>3</issue><fpage>2722</fpage><lpage>2726</lpage><pub-id pub-id-type="doi">10.1097/JS9.0000000000002215</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Cao</surname><given-names>L</given-names> </name><name name-style="western"><surname>Danek</surname><given-names>B</given-names> </name><name name-style="western"><surname>Jin</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name></person-group><article-title>Accelerating clinical evidence synthesis with large language models</article-title><source>NPJ Digit Med</source><year>2025</year><month>08</month><day>8</day><volume>8</volume><issue>1</issue><fpage>509</fpage><pub-id pub-id-type="doi">10.1038/s41746-025-01840-7</pub-id><pub-id 
pub-id-type="medline">40775042</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Clark</surname><given-names>J</given-names> </name><name name-style="western"><surname>Barton</surname><given-names>B</given-names> </name><name name-style="western"><surname>Albarqouni</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Generative artificial intelligence use in evidence synthesis: a systematic review</article-title><source>Res synth methods</source><year>2025</year><month>07</month><volume>16</volume><issue>4</issue><fpage>601</fpage><lpage>619</lpage><pub-id pub-id-type="doi">10.1017/rsm.2025.16</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lieberum</surname><given-names>JL</given-names> </name><name name-style="western"><surname>Toews</surname><given-names>M</given-names> </name><name name-style="western"><surname>Metzendorf</surname><given-names>MI</given-names> </name><etal/></person-group><article-title>Large language models for conducting systematic reviews: on the rise, but not yet ready for use-a scoping review</article-title><source>J Clin Epidemiol</source><year>2025</year><month>05</month><volume>181</volume><fpage>111746</fpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2025.111746</pub-id><pub-id pub-id-type="medline">40021099</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Siemens</surname><given-names>W</given-names> </name><name name-style="western"><surname>von Elm</surname><given-names>E</given-names> </name><name name-style="western"><surname>Binder</surname><given-names>H</given-names> 
</name><etal/></person-group><article-title>Opportunities, challenges and risks of using artificial intelligence for evidence synthesis</article-title><source>BMJ Evid Based Med</source><year>2025</year><month>12</month><day>1</day><volume>30</volume><issue>6</issue><fpage>381</fpage><lpage>384</lpage><pub-id pub-id-type="doi">10.1136/bmjebm-2024-113320</pub-id><pub-id pub-id-type="medline">39788693</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ong</surname><given-names>AY</given-names> </name><name name-style="western"><surname>Merle</surname><given-names>DA</given-names> </name><name name-style="western"><surname>Wagner</surname><given-names>SK</given-names> </name><name name-style="western"><surname>Keane</surname><given-names>PA</given-names> </name></person-group><article-title>Exploring the dilemma of AI use in medical research and knowledge synthesis: a perspective on deep research tools</article-title><source>J Med Internet Res</source><year>2025</year><month>07</month><day>15</day><volume>27</volume><issue>1</issue><fpage>e75666</fpage><pub-id pub-id-type="doi">10.2196/75666</pub-id><pub-id pub-id-type="medline">40663724</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khalil</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ameen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zarnegar</surname><given-names>A</given-names> </name></person-group><article-title>Tools to support the automation of systematic reviews: a scoping review</article-title><source>J Clin Epidemiol</source><year>2022</year><month>04</month><volume>144</volume><fpage>22</fpage><lpage>42</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2021.12.005</pub-id><pub-id 
pub-id-type="medline">34896236</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cierco Jimenez</surname><given-names>R</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>T</given-names> </name><name name-style="western"><surname>Rosillo</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Machine learning computational tools to assist the performance of systematic reviews: a mapping review</article-title><source>BMC Med Res Methodol</source><year>2022</year><month>12</month><day>16</day><volume>22</volume><issue>1</issue><fpage>322</fpage><pub-id pub-id-type="doi">10.1186/s12874-022-01805-4</pub-id><pub-id pub-id-type="medline">36522637</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Roth</surname><given-names>S</given-names> </name><name name-style="western"><surname>Wermer-Colan</surname><given-names>A</given-names> </name></person-group><article-title>Machine learning methods for systematic reviews: a rapid scoping review</article-title><source>Dela J Public Health</source><year>2023</year><month>11</month><volume>9</volume><issue>4</issue><fpage>40</fpage><lpage>47</lpage><pub-id pub-id-type="doi">10.32481/djph.2023.11.008</pub-id><pub-id pub-id-type="medline">38173960</pub-id></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Khalil</surname><given-names>H</given-names> </name><name name-style="western"><surname>Pollock</surname><given-names>D</given-names> </name><name name-style="western"><surname>McInerney</surname><given-names>P</given-names> </name><etal/></person-group><article-title>Automation tools to support undertaking 
scoping reviews</article-title><source>Res Synth Methods</source><year>2024</year><month>11</month><volume>15</volume><issue>6</issue><fpage>839</fpage><lpage>850</lpage><pub-id pub-id-type="doi">10.1002/jrsm.1731</pub-id><pub-id pub-id-type="medline">38885942</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Peters</surname><given-names>MDJ</given-names> </name><name name-style="western"><surname>Marnie</surname><given-names>C</given-names> </name><name name-style="western"><surname>Tricco</surname><given-names>AC</given-names> </name><etal/></person-group><article-title>Updated methodological guidance for the conduct of scoping reviews</article-title><source>JBI Evid Synth</source><year>2020</year><month>10</month><volume>18</volume><issue>10</issue><fpage>2119</fpage><lpage>2126</lpage><pub-id pub-id-type="doi">10.11124/JBIES-20-00167</pub-id><pub-id pub-id-type="medline">33038124</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tricco</surname><given-names>AC</given-names> </name><name name-style="western"><surname>Lillie</surname><given-names>E</given-names> </name><name name-style="western"><surname>Zarin</surname><given-names>W</given-names> </name><etal/></person-group><article-title>PRISMA extension for scoping reviews (PRISMA-ScR): checklist and explanation</article-title><source>Ann Intern Med</source><year>2018</year><month>10</month><day>2</day><volume>169</volume><issue>7</issue><fpage>467</fpage><lpage>473</lpage><pub-id pub-id-type="doi">10.7326/M18-0850</pub-id><pub-id pub-id-type="medline">30178033</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="web"><source>Litmaps</source><access-date>2025-05-12</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www.litmaps.com/">https://www.litmaps.com/</ext-link></comment></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><article-title>A pro-innovation approach to AI regulation</article-title><source>GOV.UK</source><access-date>2024-12-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper">https://www.gov.uk/government/publications/ai-regulation-a-pro-innovation-approach/white-paper</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="web"><article-title>Evidence synthesis - what is it and why do we need it?</article-title><source>Cochrane</source><year>2019</year><month>09</month><day>13</day><access-date>2024-12-20</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cochrane.org/news/evidence-synthesis-what-it-and-why-do-we-need-it">https://www.cochrane.org/news/evidence-synthesis-what-it-and-why-do-we-need-it</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="web"><source>EndNote</source><access-date>2025-05-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://endnote.com/">https://endnote.com/</ext-link></comment></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rayyan</surname><given-names>OM</given-names> </name></person-group><article-title>AI and the evolution of journalistic practices</article-title><source>J Inf Stud Technol</source><year>2025</year><month>07</month><day>31</day><volume>2</volume><pub-id pub-id-type="doi">10.5339/jist.2025.15</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cardoso 
Sampaio</surname><given-names>R</given-names> </name><name name-style="western"><surname>Chagas</surname><given-names>V</given-names> </name><name name-style="western"><surname>Sinimbu Sanchez</surname><given-names>C</given-names> </name></person-group><article-title>An artificial intelligence (AI)-assisted scoping review of emerging uses of AI in qualitative research and its ethical considerations</article-title><source>Rev Pesq Qual</source><year>2024</year><volume>12</volume><issue>30</issue><fpage>01</fpage><lpage>28</lpage><pub-id pub-id-type="doi">10.33361/RPQ.2024.v.12.n.30.729</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Teperikidis</surname><given-names>E</given-names> </name><name name-style="western"><surname>Boulmpou</surname><given-names>A</given-names> </name><name name-style="western"><surname>Potoupni</surname><given-names>V</given-names> </name><name name-style="western"><surname>Kundu</surname><given-names>S</given-names> </name><name name-style="western"><surname>Singh</surname><given-names>B</given-names> </name><name name-style="western"><surname>Papadopoulos</surname><given-names>C</given-names> </name></person-group><article-title>Does the long-term administration of proton pump inhibitors increase the risk of adverse cardiovascular outcomes? 
A ChatGPT powered umbrella review</article-title><source>Acta Cardiol</source><year>2023</year><month>11</month><volume>78</volume><issue>9</issue><fpage>980</fpage><lpage>988</lpage><pub-id pub-id-type="doi">10.1080/00015385.2023.2231299</pub-id><pub-id pub-id-type="medline">37431972</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Guo</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>W</given-names> </name><name name-style="western"><surname>Deng</surname><given-names>H</given-names> </name></person-group><article-title>Application of automated literature screening tools in systematic reviews</article-title><source>Med J Peking Union Med Coll Hosp</source><year>2024</year><volume>15</volume><issue>4</issue><fpage>921</fpage><lpage>926</lpage><pub-id pub-id-type="doi">10.12290/xhyxzz.2023-0257</pub-id></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Esposito</surname><given-names>C</given-names> </name><name name-style="western"><surname>Dell&#x2019;Omo</surname><given-names>G</given-names> </name><name name-style="western"><surname>Di Ianni</surname><given-names>D</given-names> </name><name name-style="western"><surname>Di Procolo</surname><given-names>P</given-names> </name></person-group><article-title>Human vs. ChatGPT. 
Is it possible obtain comparable results in the analysis of a scientific systematic review?</article-title><source>Recenti Prog Med</source><year>2024</year><month>09</month><volume>115</volume><issue>9</issue><fpage>420</fpage><lpage>425</lpage><pub-id pub-id-type="doi">10.1701/4334.43184</pub-id><pub-id pub-id-type="medline">39269357</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>AlSagri</surname><given-names>HS</given-names> </name><name name-style="western"><surname>Farhat</surname><given-names>F</given-names> </name><name name-style="western"><surname>Sohail</surname><given-names>SS</given-names> </name><name name-style="western"><surname>Saudagar</surname><given-names>AKJ</given-names> </name></person-group><article-title>ChatGPT or Gemini: who makes the better scientific writing assistant?</article-title><source>J Acad Ethics</source><year>2025</year><month>09</month><volume>23</volume><issue>3</issue><fpage>1121</fpage><lpage>1135</lpage><pub-id pub-id-type="doi">10.1007/s10805-024-09549-0</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>De Cassai</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dost</surname><given-names>B</given-names> </name><name name-style="western"><surname>Karapinar</surname><given-names>YE</given-names> </name><etal/></person-group><article-title>Evaluating the utility of large language models in generating search strings for systematic reviews in anesthesiology: a comparative analysis of top-ranked journals</article-title><source>Reg Anesth Pain Med</source><year>2025</year><month>01</month><day>19</day><volume>Published online January 2025</volume><fpage>4</fpage><pub-id pub-id-type="doi">10.1136/rapm-2024-106231</pub-id><pub-id 
pub-id-type="medline">39828514</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Delgado-Chaves</surname><given-names>FM</given-names> </name><name name-style="western"><surname>Jennings</surname><given-names>MJ</given-names> </name><name name-style="western"><surname>Atalaia</surname><given-names>A</given-names> </name><etal/></person-group><article-title>Transforming literature screening: the emerging role of large language models in systematic reviews</article-title><source>Proc Natl Acad Sci U S A</source><year>2025</year><month>01</month><day>14</day><volume>122</volume><issue>2</issue><fpage>e2411962122</fpage><pub-id pub-id-type="doi">10.1073/pnas.2411962122</pub-id><pub-id pub-id-type="medline">39761403</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Konet</surname><given-names>A</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>I</given-names> </name><name name-style="western"><surname>Gartlehner</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Performance of two large language models for data extraction in evidence synthesis</article-title><source>Res Synth Methods</source><year>2024</year><month>09</month><volume>15</volume><issue>5</issue><fpage>818</fpage><lpage>824</lpage><pub-id pub-id-type="doi">10.1002/jrsm.1732</pub-id><pub-id pub-id-type="medline">38895747</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>M</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>J</given-names> </name><name 
name-style="western"><surname>Tan</surname><given-names>X</given-names> </name></person-group><article-title>Evaluating the effectiveness of large language models in abstract screening: a comparative analysis</article-title><source>Syst Rev</source><year>2024</year><month>08</month><day>21</day><volume>13</volume><issue>1</issue><fpage>219</fpage><pub-id pub-id-type="doi">10.1186/s13643-024-02609-x</pub-id><pub-id pub-id-type="medline">39169386</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Omar</surname><given-names>M</given-names> </name><name name-style="western"><surname>Nassar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hijazi</surname><given-names>K</given-names> </name><name name-style="western"><surname>Glicksberg</surname><given-names>BS</given-names> </name><name name-style="western"><surname>Nadkarni</surname><given-names>GN</given-names> </name><name name-style="western"><surname>Klang</surname><given-names>E</given-names> </name></person-group><article-title>Generating credible referenced medical research: a comparative study of openAI&#x2019;s GPT-4 and Google&#x2019;s gemini</article-title><source>Comput Biol Med</source><year>2025</year><month>02</month><volume>185</volume><fpage>109545</fpage><pub-id pub-id-type="doi">10.1016/j.compbiomed.2024.109545</pub-id><pub-id pub-id-type="medline">39667055</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rathi</surname><given-names>H</given-names> </name><name name-style="western"><surname>Malik</surname><given-names>A</given-names> </name><name name-style="western"><surname>Behera</surname><given-names>DC</given-names> </name><name name-style="western"><surname>Kamboj</surname><given-names>G</given-names> 
</name></person-group><article-title>Msr28 use of large language model (LLM) for full-text screening in systematic literature reviews: a comparative analysis</article-title><source>Value Health</source><year>2024</year><month>06</month><volume>27</volume><issue>6</issue><fpage>S264</fpage><pub-id pub-id-type="doi">10.1016/j.jval.2024.03.1461</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Sanghera</surname><given-names>R</given-names> </name><name name-style="western"><surname>Thirunavukarasu</surname><given-names>AJ</given-names> </name><name name-style="western"><surname>Khoury</surname><given-names>ME</given-names> </name><etal/></person-group><article-title>High-performance automated abstract screening with large language model ensembles</article-title><source>arXiv</source><comment>Preprint posted online on  Nov 22, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2411.02451</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>&#x0160;uster</surname><given-names>S</given-names> </name><name name-style="western"><surname>Baldwin</surname><given-names>T</given-names> </name><name name-style="western"><surname>Verspoor</surname><given-names>K</given-names> </name></person-group><article-title>Zero- and few-shot prompting of generative large language models provides weak assessment of risk of bias in clinical trials</article-title><source>Res Synth Methods</source><year>2024</year><month>11</month><volume>15</volume><issue>6</issue><fpage>988</fpage><lpage>1000</lpage><pub-id pub-id-type="doi">10.1002/jrsm.1749</pub-id><pub-id pub-id-type="medline">39176994</pub-id></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Thelwall</surname><given-names>M</given-names> </name></person-group><article-title>Is Google Gemini better than ChatGPT at evaluating research quality?</article-title><source>J Data Inf Sci</source><year>2025</year><month>04</month><day>1</day><volume>10</volume><issue>2</issue><fpage>1</fpage><lpage>5</lpage><pub-id pub-id-type="doi">10.2478/jdis-2025-0014</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Trad</surname><given-names>F</given-names> </name><name name-style="western"><surname>Charafeddine</surname><given-names>J</given-names> </name><name name-style="western"><surname>Chkahtoura</surname><given-names>M</given-names> </name><name name-style="western"><surname>Rahme</surname><given-names>M</given-names> </name><name name-style="western"><surname>Fuleihan</surname><given-names>GEH</given-names> </name><name name-style="western"><surname>Chehab</surname><given-names>A</given-names> </name></person-group><article-title>Streamlining systematic reviews in medical research: a novel application of large language models</article-title><source>J Bone Miner Res</source><year>2024</year><volume>39</volume><issue>Supplement 1</issue><pub-id pub-id-type="doi">10.1093/jbmr/zjae038</pub-id><pub-id pub-id-type="medline">38477812</pub-id></nlm-citation></ref><ref id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Arno</surname><given-names>A</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>B</given-names> </name><name name-style="western"><surname>Marshall</surname><given-names>IJ</given-names> </name><name name-style="western"><surname>McKenzie</surname><given-names>JE</given-names> </name><name 
name-style="western"><surname>Elliott</surname><given-names>JH</given-names> </name></person-group><article-title>Accuracy and efficiency of machine learning-assisted risk-of-bias assessments in &#x201C;real-world&#x201D; systematic reviews: a noninferiority randomized controlled trial</article-title><source>Ann Intern Med</source><year>2022</year><month>07</month><volume>175</volume><issue>7</issue><fpage>1001</fpage><lpage>1009</lpage><pub-id pub-id-type="doi">10.7326/M22-0092</pub-id><pub-id pub-id-type="medline">35635850</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Bravo</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bennetts</surname><given-names>L</given-names> </name><name name-style="western"><surname>Atanasov</surname><given-names>P</given-names> </name></person-group><article-title>Accelerating the early identification of relevant studies in title and abstract screening</article-title><conf-name>2021 International Symposium on Computer Science and Intelligent Controls (ISCSIC)</conf-name><conf-date>Nov 12-14, 2021</conf-date><pub-id pub-id-type="doi">10.1109/ISCSIC54682.2021.00034</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ferdinands</surname><given-names>G</given-names> </name><name name-style="western"><surname>Schram</surname><given-names>R</given-names> </name><name name-style="western"><surname>de Bruin</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Performance of active learning models for screening prioritization in systematic reviews: a simulation study into the Average Time to Discover relevant records</article-title><source>Syst 
Rev</source><year>2023</year><month>06</month><day>20</day><volume>12</volume><issue>1</issue><fpage>100</fpage><pub-id pub-id-type="doi">10.1186/s13643-023-02257-7</pub-id><pub-id pub-id-type="medline">37340494</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Yao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>MV</given-names> </name><name name-style="western"><surname>Su</surname><given-names>E</given-names> </name><name name-style="western"><surname>Flores Miranda</surname><given-names>A</given-names> </name><name name-style="western"><surname>Saha</surname><given-names>A</given-names> </name><name name-style="western"><surname>Sussman</surname><given-names>J</given-names> </name></person-group><article-title>Evaluating the efficacy of artificial intelligence tools for the automation of systematic reviews in cancer research: a systematic review</article-title><source>Cancer Epidemiol</source><year>2024</year><month>02</month><volume>88</volume><fpage>102511</fpage><pub-id pub-id-type="doi">10.1016/j.canep.2023.102511</pub-id><pub-id pub-id-type="medline">38071872</pub-id></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chai</surname><given-names>KEK</given-names> </name><name name-style="western"><surname>Lines</surname><given-names>RLJ</given-names> </name><name name-style="western"><surname>Gucciardi</surname><given-names>DF</given-names> </name><name name-style="western"><surname>Ng</surname><given-names>L</given-names> </name></person-group><article-title>Research Screener: a machine learning tool to semi-automate abstract screening for systematic reviews</article-title><source>Syst 
Rev</source><year>2021</year><month>04</month><day>1</day><volume>10</volume><issue>1</issue><fpage>93</fpage><pub-id pub-id-type="doi">10.1186/s13643-021-01635-3</pub-id><pub-id pub-id-type="medline">33795003</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van Dinter</surname><given-names>R</given-names> </name><name name-style="western"><surname>Catal</surname><given-names>C</given-names> </name><name name-style="western"><surname>Tekinerdogan</surname><given-names>B</given-names> </name></person-group><article-title>A multi-channel convolutional neural network approach to automate the citation screening process</article-title><source>Appl Soft Comput</source><year>2021</year><month>11</month><volume>112</volume><fpage>107765</fpage><pub-id pub-id-type="doi">10.1016/j.asoc.2021.107765</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Kusa</surname><given-names>W</given-names> </name><name name-style="western"><surname>Hanbury</surname><given-names>A</given-names> </name><name name-style="western"><surname>Knoth</surname><given-names>P</given-names> </name></person-group><article-title>Automation of citation screening for systematic literature reviews using neural networks: a replicability study</article-title><conf-name>Advances in Information Retrieval - 44th European Conference on IR Research (ECIR) 2022</conf-name><conf-date>Apr 10-14, 2022</conf-date><pub-id pub-id-type="doi">10.1007/978-3-030-99736-6_39</pub-id></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Muthu</surname><given-names>S</given-names> </name></person-group><article-title>The efficiency of machine learning-assisted 
platform for article screening in systematic reviews in orthopaedics</article-title><source>Int Orthop</source><year>2023</year><month>02</month><volume>47</volume><issue>2</issue><fpage>551</fpage><lpage>556</lpage><pub-id pub-id-type="doi">10.1007/s00264-022-05672-y</pub-id><pub-id pub-id-type="medline">36562816</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Akinseloyin</surname><given-names>O</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>XR</given-names> </name><name name-style="western"><surname>Palade</surname><given-names>V</given-names> </name></person-group><article-title>A question-answering framework for automated abstract screening using large language models</article-title><source>J Am Med Inform Assoc</source><year>2024</year><month>09</month><day>1</day><volume>31</volume><issue>9</issue><fpage>1939</fpage><lpage>1952</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocae166</pub-id><pub-id pub-id-type="medline">39042516</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Feng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Liang</surname><given-names>S</given-names> </name><name name-style="western"><surname>Zhang</surname><given-names>Y</given-names> </name><etal/></person-group><article-title>Automated medical literature screening using artificial intelligence: a systematic review and meta-analysis</article-title><source>J Am Med Inform Assoc</source><year>2022</year><month>07</month><day>12</day><volume>29</volume><issue>8</issue><fpage>1425</fpage><lpage>1432</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocac066</pub-id><pub-id pub-id-type="medline">35641139</pub-id></nlm-citation></ref><ref 
id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tsou</surname><given-names>AY</given-names> </name><name name-style="western"><surname>Treadwell</surname><given-names>JR</given-names> </name><name name-style="western"><surname>Erinoff</surname><given-names>E</given-names> </name><name name-style="western"><surname>Schoelles</surname><given-names>K</given-names> </name></person-group><article-title>Machine learning for screening prioritization in systematic reviews: comparative performance of Abstrackr and EPPI-Reviewer</article-title><source>Syst Rev</source><year>2020</year><month>04</month><day>2</day><volume>9</volume><issue>1</issue><fpage>73</fpage><pub-id pub-id-type="doi">10.1186/s13643-020-01324-7</pub-id><pub-id pub-id-type="medline">32241297</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Jahan</surname><given-names>I</given-names> </name><name name-style="western"><surname>Laskar</surname><given-names>MTR</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>C</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>J</given-names> </name></person-group><article-title>Evaluation of ChatGPT on biomedical tasks: a zero-shot comparison with fine-tuned generative transformers</article-title><source>arXiv</source><comment>Preprint posted online on  Aug 24, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2306.04504</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhong</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Ding</surname><given-names>L</given-names> </name><name 
name-style="western"><surname>Liu</surname><given-names>J</given-names> </name><name name-style="western"><surname>Du</surname><given-names>B</given-names> </name><name name-style="western"><surname>Tao</surname><given-names>D</given-names> </name></person-group><article-title>Can ChatGPT understand too? A comparative study on ChatGPT and fine-tuned BERT</article-title><source>arXiv</source><comment>Preprint posted online on  Mar 2, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2302.10198</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Draxler</surname><given-names>F</given-names> </name><name name-style="western"><surname>Werner</surname><given-names>A</given-names> </name><name name-style="western"><surname>Lehmann</surname><given-names>F</given-names> </name><etal/></person-group><article-title>The AI ghostwriter effect: when users do not perceive ownership of AI-generated text but self-declare as authors</article-title><source>ACM Trans Comput-Hum Interact</source><year>2024</year><month>04</month><day>30</day><volume>31</volume><issue>2</issue><fpage>1</fpage><lpage>40</lpage><pub-id pub-id-type="doi">10.1145/3637875</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hosseini</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gordijn</surname><given-names>B</given-names> </name><name name-style="western"><surname>Kaebnick</surname><given-names>GE</given-names> </name><name name-style="western"><surname>Holmes</surname><given-names>K</given-names> </name></person-group><article-title>Disclosing generative AI use for writing assistance should be voluntary</article-title><source>Res 
Ethics</source><year>2025</year><month>06</month><day>21</day><volume>21</volume><issue>4</issue><fpage>728</fpage><lpage>735</lpage><pub-id pub-id-type="doi">10.1177/17470161251345499</pub-id><pub-id pub-id-type="medline">40950847</pub-id></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Cohen</surname><given-names>JF</given-names> </name><name name-style="western"><surname>Moher</surname><given-names>D</given-names> </name></person-group><article-title>Generative artificial intelligence and academic writing: friend or foe?</article-title><source>J Clin Epidemiol</source><year>2025</year><month>03</month><volume>179</volume><fpage>111646</fpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2024.111646</pub-id><pub-id pub-id-type="medline">39706536</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Formosa</surname><given-names>P</given-names> </name><name name-style="western"><surname>Bankins</surname><given-names>S</given-names> </name><name name-style="western"><surname>Matulionyte</surname><given-names>R</given-names> </name><name name-style="western"><surname>Ghasemi</surname><given-names>O</given-names> </name></person-group><article-title>Can ChatGPT be an author? 
Generative AI creative writing assistance and perceptions of authorship, creatorship, responsibility, and disclosure</article-title><source>AI &#x0026; Soc</source><year>2025</year><month>06</month><volume>40</volume><issue>5</issue><fpage>3405</fpage><lpage>3417</lpage><pub-id pub-id-type="doi">10.1007/s00146-024-02081-0</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dai</surname><given-names>ZY</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>FQ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>C</given-names> </name><etal/></person-group><article-title>Accuracy of large language models for literature screening in thoracic surgery: diagnostic study</article-title><source>J Med Internet Res</source><year>2025</year><month>03</month><day>11</day><volume>27</volume><fpage>e67488</fpage><pub-id pub-id-type="doi">10.2196/67488</pub-id><pub-id pub-id-type="medline">40068152</pub-id></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Datta</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rastegar-Mojarad</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Enhancing systematic literature reviews with generative artificial intelligence: development, applications, and performance evaluation</article-title><source>J Am Med Inform Assoc</source><year>2025</year><month>04</month><day>1</day><volume>32</volume><issue>4</issue><fpage>616</fpage><lpage>625</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaf030</pub-id><pub-id pub-id-type="medline">40036547</pub-id></nlm-citation></ref><ref 
id="ref60"><label>60</label><nlm-citation citation-type="web"><source>International Collaboration for the Automation of Systematic Reviews</source><access-date>2025-05-12</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://icasr.github.io/">https://icasr.github.io/</ext-link></comment></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name><name name-style="western"><surname>Flemyng</surname><given-names>E</given-names> </name><name name-style="western"><surname>Noel-Storr</surname><given-names>A</given-names> </name></person-group><article-title>Responsible AI in Evidence Synthesis (RAISE): guidance and recommendations</article-title><source>Open Science Framework</source><year>2024</year><month>08</month><day>29</day><access-date>2025-04-24</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://osf.io/fwaud/">https://osf.io/fwaud/</ext-link></comment></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rillig</surname><given-names>MC</given-names> </name><name name-style="western"><surname>&#x00C5;gerstrand</surname><given-names>M</given-names> </name><name name-style="western"><surname>Bi</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gould</surname><given-names>KA</given-names> </name><name name-style="western"><surname>Sauerland</surname><given-names>U</given-names> </name></person-group><article-title>Risks and benefits of large language models for the environment</article-title><source>Environ Sci Technol</source><year>2023</year><month>03</month><day>7</day><volume>57</volume><issue>9</issue><fpage>3464</fpage><lpage>3466</lpage><pub-id pub-id-type="doi">10.1021/acs.est.3c01106</pub-id><pub-id 
pub-id-type="medline">36821477</pub-id></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ji</surname><given-names>Z</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>M</given-names> </name></person-group><article-title>A systematic review of electricity demand for large language models: evaluations, challenges, and solutions</article-title><source>Renew Sustain Energy Rev</source><year>2026</year><month>01</month><volume>225</volume><fpage>116159</fpage><pub-id pub-id-type="doi">10.1016/j.rser.2025.116159</pub-id></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hosseini</surname><given-names>M</given-names> </name><name name-style="western"><surname>Gao</surname><given-names>P</given-names> </name><name name-style="western"><surname>Vivas-Valencia</surname><given-names>C</given-names> </name></person-group><article-title>A social-environmental impact perspective of generative artificial intelligence</article-title><source>Environ Sci Ecotechnol</source><year>2025</year><month>01</month><volume>23</volume><fpage>100520</fpage><pub-id pub-id-type="doi">10.1016/j.ese.2024.100520</pub-id><pub-id pub-id-type="medline">39811486</pub-id></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Singh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Patel</surname><given-names>NP</given-names> </name><name name-style="western"><surname>Ehtesham</surname><given-names>A</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Khoei</surname><given-names>TT</given-names> 
</name></person-group><article-title>A survey of sustainability in large language models: applications, economics, and challenges</article-title><source>arXiv</source><comment>Preprint posted online on Dec 6, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2412.04782</pub-id></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Supplementary materials including detailed database search strategies, PRISMA-S checklist, and categorized tables of AI tools and open-source models identified in the review.</p><media xlink:href="jmir_v28i1e81597_app1.docx" xlink:title="DOCX File, 49 KB"/></supplementary-material><supplementary-material id="app2"><label>Checklist 1</label><p>PRISMA-ScR checklist.</p><media xlink:href="jmir_v28i1e81597_app2.docx" xlink:title="DOCX File, 28 KB"/></supplementary-material></app-group></back></article>