<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v27i1e69700</article-id><article-id pub-id-type="doi">10.2196/69700</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>Leveraging AI to Optimize Maintenance of Health Evidence and Offer a One-Stop Shop for Quality-Appraised Evidence Syntheses on the Effectiveness of Public Health Interventions: Quality Improvement Project</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Rogers</surname><given-names>Kristin</given-names></name><degrees>MPH</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Miller</surname><given-names>Alanna</given-names></name><degrees>MPH</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Girgis</surname><given-names>Ashley</given-names></name><degrees>MHI</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name 
name-style="western"><surname>Clark</surname><given-names>Emily C</given-names></name><degrees>MSc</degrees><xref ref-type="aff" rid="aff1">1</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Neil-Sztramko</surname><given-names>Sarah E</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>Dobbins</surname><given-names>Maureen</given-names></name><degrees>RN, PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff3">3</xref></contrib></contrib-group><aff id="aff1"><institution>National Collaborating Centre for Methods and Tools, School of Nursing, McMaster University</institution><addr-line>175 Longwood Road South, Suite 210A</addr-line><addr-line>Hamilton</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff2"><institution>Department of Health Research Methods, Evidence, and Impact, Faculty of Health Sciences, McMaster University</institution><addr-line>Hamilton</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff3"><institution>School of Nursing, Faculty of Health Sciences, McMaster University</institution><addr-line>Hamilton</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Patel</surname><given-names>Dhavalkumar</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Ebrahim</surname><given-names>Mansoor Veliyathnadu</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Gandhi</surname><given-names>Meghal</given-names></name></contrib><contrib 
contrib-type="reviewer"><name name-style="western"><surname>Hou</surname><given-names>Zhen</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Maureen Dobbins, RN, PhD, National Collaborating Centre for Methods and Tools, School of Nursing, McMaster University, 175 Longwood Road South, Suite 210A, Hamilton, ON, L8P0A1, Canada, 1 9055259140 ext 20450; <email>dobbinsm@mcmaster.ca</email></corresp></author-notes><pub-date pub-type="collection"><year>2025</year></pub-date><pub-date pub-type="epub"><day>29</day><month>7</month><year>2025</year></pub-date><volume>27</volume><elocation-id>e69700</elocation-id><history><date date-type="received"><day>06</day><month>12</month><year>2024</year></date><date date-type="rev-recd"><day>04</day><month>06</month><year>2025</year></date><date date-type="accepted"><day>05</day><month>06</month><year>2025</year></date></history><copyright-statement>&#x00A9; Kristin Rogers, Alanna Miller, Ashley Girgis, Emily C Clark, Sarah E Neil-Sztramko, Maureen Dobbins. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 29.7.2025. </copyright-statement><copyright-year>2025</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2025/1/e69700"/><abstract><sec><title>Background</title><p>Health Evidence provides access to quality appraisals for &#x003E;10,000 evidence syntheses on the effectiveness and cost-effectiveness of public health and health promotion interventions. Maintaining Health Evidence has become increasingly resource-intensive due to the exponential growth of published literature. Innovative screening methods using artificial intelligence (AI) can potentially improve efficiency.</p></sec><sec><title>Objective</title><p>The objectives of this project are to: (1) assess the ability of AI-assisted screening to correctly predict nonrelevant references at the title and abstract level and investigate the consistency of this performance over time, and (2) evaluate the impact of AI-assisted screening on the overall monthly manual screening set.</p></sec><sec sec-type="methods"><title>Methods</title><p>Training and testing were conducted using the DistillerSR AI Preview &#x0026; Rank feature. A set of manually screened references (n=43,273) was uploaded and used to train the AI feature and assign probability scores to each reference to predict relevance. A minimum threshold was established where the AI feature correctly identified all manually screened relevant references. The AI feature was tested on a separate set of references (n=72,686) from the May 2019 to April 2020 monthly searches. The testing set was used to determine an optimal threshold that ensured &#x003E;99% of relevant references would continue to be added to Health Evidence. 
The performance of AI-assisted screening at the title and abstract screening level was evaluated using recall, specificity, precision, negative predictive value, and the number of references removed by AI. The number and percentage of references removed by AI-assisted screening and the change in monthly manual screening time were estimated using an implementation reference set (n=272,253) from November 2020 to 2023.</p></sec><sec sec-type="results"><title>Results</title><p>The minimum threshold in the training set of references was 0.068, which correctly removed 37% (n=16,122) of nonrelevant references. Analysis of the testing set identified an optimal threshold of 0.17, which removed 51,706 (71.14%) references using AI-assisted screening. A slight decrease in recall between the 0.068 minimum threshold (99.68%) and the 0.17 optimal threshold (94.84%) was noted, resulting in four missed references included via manual screening at the full-text level. This was accompanied by an increase in specificity from 35.95% to 71.70%, doubling the proportion of references AI-assisted screening correctly predicted as not relevant. Over 3 years of implementation, the number of references requiring manual screening was reduced by 70%, reducing the time spent manually screening by an estimated 382 hours.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Given the magnitude of newly published peer-reviewed evidence, the curation of evidence supports decision makers in making informed decisions. 
AI-assisted screening can be an important tool to supplement manual screening and reduce the number of references that require manual screening, ensuring that the continued availability of curated high-quality synthesis evidence in public health is possible.</p></sec></abstract><kwd-group><kwd>machine learning</kwd><kwd>natural language processing</kwd><kwd>automation</kwd><kwd>title and abstract screening</kwd><kwd>text classification</kwd><kwd>database management</kwd><kwd>citation screening</kwd><kwd>methodology</kwd><kwd>systematic review</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background</title><p>Public health programs, services, and policies aim to promote health and prevent injury and disease across populations [<xref ref-type="bibr" rid="ref1">1</xref>]. Public health initiatives cover a breadth of topics, including water and air quality testing, infectious disease surveillance, vaccine provision, alcohol and tobacco sales legislation, and school nutrition programs. Public health practitioners and policy makers are expected to seek the best available evidence to inform decisions on implementing effective interventions to improve the health and well-being of communities and populations. Evidence-informed decision-making in public health involves using the best available evidence from research, local context, community or political preferences, and public health resources to improve health outcomes [<xref ref-type="bibr" rid="ref2">2</xref>]. The use of rigorous evidence syntheses is a key component in an evidence-informed approach [<xref ref-type="bibr" rid="ref3">3</xref>]. Synthesized evidence, such as systematic reviews, brings together findings from all studies on a specific research question to contribute to public health decision-making [<xref ref-type="bibr" rid="ref4">4</xref>]. 
Variability in the methodological quality of evidence syntheses, time constraints, and resource limitations in using synthesized evidence are barriers to achieving evidence-informed public health decisions [<xref ref-type="bibr" rid="ref5">5</xref>-<xref ref-type="bibr" rid="ref7">7</xref>]. In 2005, Health Evidence was established to address these challenges by facilitating access to high-quality evidence syntheses, offering public health practitioners a one-stop shop for preappraised syntheses relevant to public health [<xref ref-type="bibr" rid="ref8">8</xref>].</p></sec><sec id="s1-2"><title>Health Evidence</title><p>Health Evidence hosts over 10,000 quality appraisals of published evidence syntheses on the effectiveness and cost-effectiveness of public health and health promotion interventions [<xref ref-type="bibr" rid="ref8">8</xref>]. Over the past 10 years, over half a million users have accessed Health Evidence worldwide. To keep Health Evidence up to date with the most recent evidence syntheses, monthly database searches of MEDLINE, Embase, CINAHL, and PsycINFO, monthly hand searches of Cochrane Library, Health Systems Evidence, and ACCESSSS Smart Search, and annual database searches of BIOSIS, SPORTDiscus, and Sociological Abstracts are conducted. Duplicates are removed, and the titles and abstracts of all search results are screened manually for relevance. The full texts of all potentially relevant syntheses are screened using 5 criteria for inclusion (<xref ref-type="other" rid="box1">Textbox 1</xref>) [<xref ref-type="bibr" rid="ref9">9</xref>]. References that meet all 5 inclusion criteria are quality appraised by two independent raters using the Health Evidence Quality Assessment Tool, indexed with keywords, and added to the Health Evidence website [<xref ref-type="bibr" rid="ref10">10</xref>]. 
An overview of the Health Evidence workflow is illustrated in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><p>As public health spans a broad array of topics, the search strategy must also be broad to ensure all relevant public health reviews are captured. Previous developmental work at Health Evidence concluded that it is more efficient to search the published literature for systematic reviews and then screen for those relevant to public health, as opposed to developing a search strategy specific to all public health topics [<xref ref-type="bibr" rid="ref11">11</xref>]. The last two decades have seen exponential growth in the number of published evidence syntheses [<xref ref-type="bibr" rid="ref12">12</xref>-<xref ref-type="bibr" rid="ref15">15</xref>], leading to a notable increase in the search results retrieved from monthly and annual database searches for Health Evidence. From 2018 to 2023, the average monthly database search results increased by 47%, from 8829 to 13,007. As the volume of published evidence syntheses grows, updating Health Evidence has become increasingly resource-intensive. 
Innovative screening methods using artificial intelligence (AI) are a potential solution to reduce the number of references requiring manual screening and ensure that Health Evidence remains feasible to maintain.</p><boxed-text id="box1"><title> Health Evidence criteria.</title><list list-type="bullet" prefix-word="1"> <list-item><p>Is this a review paper?</p></list-item> <list-item><p>Is the review relevant to public health or health promotion practice?</p></list-item> <list-item><p>Is the effectiveness of an intervention, program, service, or policy the subject of the review?</p></list-item> <list-item><p>Is evidence on outcomes included?</p></list-item><list-item><p>Is the search strategy described?</p></list-item> </list></boxed-text><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Health Evidence monthly workflow and AI-assisted screening integration and analysis. AI: artificial intelligence.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v27i1e69700_fig01.png"/></fig></sec><sec id="s1-3"><title>Application of AI to Maintain Health Evidence</title><p>AI commonly refers to the interdisciplinary study and development of models engineered to perform varied levels of automation that would typically require human intelligence [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. Machine learning is a subset of AI that involves algorithms that autonomously learn patterns from the data they are trained on without being explicitly programmed [<xref ref-type="bibr" rid="ref16">16</xref>]. Natural language processing is closely tied to machine learning algorithms, which involve computer systems&#x2019; ability to automatically process human language by segmenting unstructured text into smaller chunks and applying natural language processing techniques until a desirable pipeline is achieved [<xref ref-type="bibr" rid="ref18">18</xref>]. 
The use of AI to semiautomate the screening of studies for relevance within the systematic review process has been described as promising [<xref ref-type="bibr" rid="ref19">19</xref>-<xref ref-type="bibr" rid="ref21">21</xref>].</p><p>In 2018, Health Evidence began investigating the use of AI to reduce the burden of manual title and abstract screening using an existing web-based review management platform, DistillerSR. This platform was selected based on the team&#x2019;s familiarity with the software, its low cost, and the availability of live technical support [<xref ref-type="bibr" rid="ref22">22</xref>]. The DistillerSR Preview &#x0026; Rank feature is a logistic regression classifier that learns from manual screening decisions by applying a supervised machine learning model through a support vector machine nonprobabilistic binary linear classifier [<xref ref-type="bibr" rid="ref23">23</xref>]. The Preview &#x0026; Rank feature is preconfigured, internally tuned, and uses language-modeling-based feature representation to learn from the language patterns provided in the training data. To prepare the training data, standard preprocessing steps are applied, including tokenization, stop-word removal, and normalization. By learning from the data provided by labeled manual screening decisions, this feature assigns references a score between 0 and 1 to predict the probability that the reference is relevant, with scores closer to 1 more likely to be relevant. These assigned probability scores can be used to test the performance of the AI feature using multiple thresholds and select an optimal threshold by assessing the proportion of correctly predicted references compared to manual screening results. The optimal threshold is defined as the probability score that correctly identifies relevant references while correctly removing the greatest number of nonrelevant references. 
Reviewers can then remove references with a score below the optimal threshold without manual review. To address known challenges in using AI to support reference screening, quality assurance testing and continuous monitoring are recommended to ensure that the AI feature continues to function as expected. These challenges may include variability in the content of the monthly searches and AI concept drift, for example, when novel phenomena emerge that were not captured in the training set [<xref ref-type="bibr" rid="ref24">24</xref>]. The objectives of this project are to (1) assess the ability of AI-assisted screening to correctly predict nonrelevant references at the title and abstract level using an optimal threshold, and investigate the consistency of this performance over time; and (2) evaluate the impact of AI-assisted screening on the overall monthly manual screening set.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><sec id="s2-1"><title>Ethical Considerations</title><p>As this is a quality improvement project that does not include participants or participant data, an ethics review was not required per the guidelines of the Hamilton Integrated Research Ethics Board [<xref ref-type="bibr" rid="ref25">25</xref>].</p></sec><sec id="s2-2"><title>Datasets</title><p>Four types of reference sets from various database searches were used in this project: the &#x201C;AI training set,&#x201D; the &#x201C;AI testing set,&#x201D; the &#x201C;quality assurance sets,&#x201D; and the &#x201C;implementation set.&#x201D; The following sections describe these datasets in detail.</p></sec><sec id="s2-3"><title>Objective 1: Assessing the Ability of AI-Assisted Screening to Predict Nonrelevant References and Investigating Consistency in Performance Over Time</title><sec id="s2-3-1"><title>Phase 1: Training the AI Feature</title><p>The &#x201C;AI training set&#x201D; (n=43,273) included all evidence syntheses included in Health Evidence from 2005 to 2018 
(n=6742) and a set of references that were deemed not relevant through manual title and abstract screening from January to August 2018 (n=36,531). The size of the &#x201C;AI training set&#x201D; was limited by the processing capacity of DistillerSR at the time; thus, only a subset of nonrelevant references (from 2018) was included. The &#x201C;AI training set&#x201D; was uploaded to DistillerSR, and the data were used to train the AI feature and assign a probability score to each reference in the set. The range of probability scores assigned to references was examined to establish a minimum threshold by identifying the probability score at the title and abstract level, where all references were correctly identified as relevant. The proportion of references identified as not relevant by AI-assisted screening at the minimum threshold was recorded.</p></sec><sec id="s2-3-2"><title>Phase 2: Testing the AI Feature</title><p>The &#x201C;AI testing set&#x201D; included the manually screened search results from May 2019 to April 2020 (n=72,686). The &#x201C;AI testing set&#x201D; was uploaded to DistillerSR, and the trained AI feature described above was applied to assign probability scores to each reference prior to manual screening. Manual screening was conducted independently from the platform to ensure there was no indication of the manual screening result. This new set of references was used to test the performance of the AI-assisted screening feature by comparing (1) the results of the AI-assisted screening to the results of the manual title and abstract screening and (2) the number of missed references identified incorrectly as not relevant through AI-assisted screening to the references included at manual full-text screening and added to Health Evidence. 
Integration and analysis points in the Health Evidence workflow for comparing AI-assisted screening to manual screening are depicted in <xref ref-type="fig" rid="figure1">Figure 1</xref>.</p><p>A series of AI-screening tests were performed to identify the optimal threshold, defined as the threshold resulting in the highest number of references correctly predicted by AI as not relevant at the title and abstract level (ie, the maximum reduction in references to manually screen) with minimal references incorrectly excluded at the full-text screening level. References with a probability score below the optimal threshold can be removed without review, whereas references with a score equal to or greater than the optimal threshold require manual screening.</p><p>To examine the overall impact of these tested thresholds on title and abstract screening, performance was assessed by identifying true positives, true negatives, false positives, and false negatives. A true positive was a reference identified as relevant by AI-assisted and manual screening. A true negative was a reference identified as not relevant by AI-assisted and manual screening. False positive references were those identified by AI as relevant but assessed as not relevant in manual screening. False negative references were those identified by AI as not relevant but assessed as relevant in manual screening [<xref ref-type="bibr" rid="ref26">26</xref>]. 
These values were used to calculate recall (the proportion of references correctly identified by AI as relevant out of all manually relevant references, also referred to as sensitivity), specificity (the proportion of references correctly identified by AI as not relevant out of all manually not relevant references), precision (the proportion of references correctly identified by AI as relevant out of all references predicted as relevant), and negative predictive value (the proportion of references correctly identified by AI as not relevant out of all references predicted as not relevant). The absolute number of references identified as not relevant by AI-assisted screening at each tested threshold was used to estimate the reduction in the number of references that require manual screening.</p><p>As the goal of the project was to use AI-assisted screening to reduce the number of references requiring manual screening, while minimizing false negatives after full text review, the negative predictive value was prioritized over other values, such as <italic>F</italic><sub>1</sub>-score (which helps to identify a balance between precision and recall). A pragmatic approach was adopted to determine the optimal threshold to ensure that over 99% of all references added to Health Evidence through manual screening would have been captured if AI-assisted title and abstract screening had been integrated. Over 5 years (2015&#x2010;2019), the average number of references added to Health Evidence annually was 511, establishing an acceptable error rate of no more than five incorrectly excluded references (less than 1%) at the full-text screening level. The number of false negatives at both the title and abstract level and after manual full-text screening was calculated to determine the impact on the references that would ultimately be included in Health Evidence. 
Possible threshold scores were tested using the smallest probability score increments available on the platform (0.01), starting at the minimum threshold identified in Phase 1 up to five incorrectly excluded references. Incorrectly excluded references were examined to try to understand and explain patterns in what was ultimately added to Health Evidence at full-text screening but predicted as not relevant by AI-assisted title and abstract screening, for example, new phenomena that were not included in the &#x201C;AI training set&#x201D; (ie, COVID-19).</p><p>AI-assisted title and abstract screening using the optimal threshold established in the testing phase was then integrated into the Health Evidence monthly workflow. Implementation involved adapting the monthly workflow to upload the search results to the DistillerSR platform and apply the AI feature. The AI feature assigned probability scores to each uploaded reference, and the results were exported to EndNote (Clarivate). References with a probability score lower than the optimal threshold were removed from the screening set and not manually screened. Title and abstract, and full-text manual screening were conducted on the remaining references.</p></sec><sec id="s2-3-3"><title>Phase 3: Performing Quality Assurance Checks of the AI Feature</title><p>Following the implementation of AI-assisted screening in the Health Evidence monthly workflow, a series of quality assurance checks were completed to ensure that the AI feature continued to perform as expected over time and identify when the AI model may need to be retrained. Quality assurance sets included references from the full 2020 annual database searches (n=9759) and monthly searches in July 2022 (n=4160), September 2023 (n=4482), and October 2024 (n=5201). Each set was manually screened and compared to AI-assisted screening results using the optimal threshold identified. 
The absolute number of false negatives was recorded.</p></sec></sec><sec id="s2-4"><title>Objective 2: Evaluating the Impact of AI-Assisted Screening on Monthly Manual Screening</title><p>The &#x201C;implementation set&#x201D; (n=272,253) generated upon integrating AI-assisted screening into the Health Evidence monthly workflow was used to estimate the real-time impact of AI-assisted screening on monthly staff manual screening time. These included monthly search results from November 2020 to 2023. The number and percentage of references removed using the optimal threshold were recorded. A manual screening rate of 500 references per hour, based on 15 years of Health Evidence manual screening experience, was used to estimate the reduction in the number of hours of manual screening using AI-assisted screening to remove references below the optimal threshold.</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><sec id="s3-1"><title>Objective 1: Assessing the Ability of AI-Assisted Screening to Predict Nonrelevant References and Investigating Consistency in Performance Over Time</title><sec id="s3-1-1"><title>Phase 1: Findings From Training the AI Feature</title><p>Using the &#x201C;AI training set,&#x201D; the minimum threshold was identified as 0.068. At 0.068, the AI feature correctly identified all relevant references and 37% (n=16,122) of the nonrelevant references at the title and abstract screening level.</p></sec><sec id="s3-1-2"><title>Phase 2: Findings From Testing the AI Feature</title><p>Starting with the minimum threshold of 0.068, incremental threshold increases of 0.01 were assessed from 0.07 until five references incorrectly excluded at full-text manual screening were identified. 
Investigation of the five incorrectly excluded references found that three of these references were new phenomena (COVID-19) that did not exist in the &#x201C;AI training set.&#x201D; As such, those references were not considered AI-assisted screening errors, and incremental testing continued. To capture these types of new phenomena references, a separate hand search was implemented. Thresholds were tested, ranging from 0.07 to 0.18, at which time five references were incorrectly excluded and not explained by novel phenomena. Based on these findings, 0.17 was adopted as the optimal threshold. The overall performance (recall, specificity, precision, negative predictive value, and number removed by AI) of AI-assisted screening at the title and abstract screening level, and the number of false negative references at each screening level are reported in <xref ref-type="table" rid="table1">Table 1</xref>.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Performance of AI<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup>-assisted screening and number of false negatives in the &#x201C;AI testing set.&#x201D;</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom" rowspan="2">Threshold</td><td align="left" valign="bottom" rowspan="2">Recall, %</td><td align="left" valign="bottom" rowspan="2">Specificity, %</td><td align="left" valign="bottom" rowspan="2">Precision, %</td><td align="left" valign="bottom" rowspan="2">Negative predictive value, %</td><td align="left" valign="bottom" rowspan="2">Removed by AI, n</td><td align="left" valign="bottom" colspan="3">False negatives, n</td></tr><tr><td align="left" valign="bottom">Title and abstract</td><td align="left" valign="bottom">Full text</td><td align="left" valign="bottom">Full text<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup></td></tr></thead><tbody><tr><td align="left" valign="top">0.068</td><td align="left" 
valign="top">99.68</td><td align="left" valign="top">35.95</td><td align="left" valign="top">1.3</td><td align="left" valign="top">99.99</td><td align="left" valign="top">25,909</td><td align="left" valign="top">2</td><td align="left" valign="top">1</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">0.07</td><td align="left" valign="top">99.52</td><td align="left" valign="top">37.07</td><td align="left" valign="top">1.3</td><td align="left" valign="top">99.99</td><td align="left" valign="top">26,717</td><td align="left" valign="top">3</td><td align="left" valign="top">1</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">0.08</td><td align="left" valign="top">99.52</td><td align="left" valign="top">42.25</td><td align="left" valign="top">1.5</td><td align="left" valign="top">99.99</td><td align="left" valign="top">30,452</td><td align="left" valign="top">3</td><td align="left" valign="top">1</td><td align="left" valign="top">0</td></tr><tr><td align="left" valign="top">0.09</td><td align="left" valign="top">99.19</td><td align="left" valign="top">46.98</td><td align="left" valign="top">1.6</td><td align="left" valign="top">99.99</td><td align="left" valign="top">33,859</td><td align="left" valign="top">5</td><td align="left" valign="top">2</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">0.1</td><td align="left" valign="top">98.87</td><td align="left" valign="top">51.13</td><td align="left" valign="top">1.7</td><td align="left" valign="top">99.98</td><td align="left" valign="top">36,857</td><td align="left" valign="top">7</td><td align="left" valign="top">2</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">0.11</td><td align="left" valign="top">98.39</td><td align="left" valign="top">55.11</td><td align="left" valign="top">1.9</td><td align="left" valign="top">99.97</td><td align="left" valign="top">39,724</td><td align="left" valign="top">10</td><td 
align="left" valign="top">3</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">0.12</td><td align="left" valign="top">97.74</td><td align="left" valign="top">58.62</td><td align="left" valign="top">2</td><td align="left" valign="top">99.97</td><td align="left" valign="top">42,258</td><td align="left" valign="top">14</td><td align="left" valign="top">3</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">0.13</td><td align="left" valign="top">96.94</td><td align="left" valign="top">61.78</td><td align="left" valign="top">2.1</td><td align="left" valign="top">99.96</td><td align="left" valign="top">44,540</td><td align="left" valign="top">19</td><td align="left" valign="top">3</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">0.14</td><td align="left" valign="top">96.45</td><td align="left" valign="top">64.53</td><td align="left" valign="top">2.3</td><td align="left" valign="top">99.95</td><td align="left" valign="top">46,528</td><td align="left" valign="top">22</td><td align="left" valign="top">3</td><td align="left" valign="top">1</td></tr><tr><td align="left" valign="top">0.15</td><td align="left" valign="top">95.97</td><td align="left" valign="top">67.20</td><td align="left" valign="top">2.5</td><td align="left" valign="top">99.95</td><td align="left" valign="top">48,455</td><td align="left" valign="top">25</td><td align="left" valign="top">4</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">0.16</td><td align="left" valign="top">95.48</td><td align="left" valign="top">69.61</td><td align="left" valign="top">2.6</td><td align="left" valign="top">99.94</td><td align="left" valign="top">50,196</td><td align="left" valign="top">28</td><td align="left" valign="top">4</td><td align="left" valign="top">2</td></tr><tr><td align="left" valign="top">0.17</td><td align="left" valign="top">94.84</td><td align="left" valign="top">71.70</td><td align="left" 
valign="top">2.8</td><td align="left" valign="top">99.94</td><td align="left" valign="top">51,706</td><td align="left" valign="top">32</td><td align="left" valign="top">7</td><td align="left" valign="top">4</td></tr><tr><td align="left" valign="top">0.18</td><td align="left" valign="top">94.35</td><td align="left" valign="top">73.64</td><td align="left" valign="top">3</td><td align="left" valign="top">99.93</td><td align="left" valign="top">53,108</td><td align="left" valign="top">35</td><td align="left" valign="top">9</td><td align="left" valign="top">6</td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>AI: artificial intelligence.</p></fn><fn id="table1fn2"><p><sup>b</sup>Adjusted number of false negative references at the full-text screening level, not considering new phenomena (ie, COVID-19).</p></fn></table-wrap-foot></table-wrap><p>A slight decrease in recall between the 0.068 minimum threshold (99.68%) and the 0.17 optimal threshold (94.84%) was noted, resulting in four missed references that were included at the full-text screening level using manual screening. This was accompanied by a substantial increase in specificity from 35.95% to 71.70%, doubling the proportion of references that AI-assisted screening correctly identified as not relevant and significantly reducing the number of search results requiring manual title and abstract screening by 51,706 (71.14%) references. These changes are reflected in the high negative predictive value, which remained above 99.9% at all thresholds tested. Precision was low, ranging from 1.3% to 3%, due to the high number of false positives that would subsequently undergo manual screening. 
This supports the need to continue conducting manual screening on the references identified as relevant by AI-assisted screening.</p><p>In a review of the four false negative references that were included through manual screening at the full-text level, it was determined that their exclusion related to types of intervention and population studied. For example, two focused on patients infected with HIV [<xref ref-type="bibr" rid="ref27">27</xref>,<xref ref-type="bibr" rid="ref28">28</xref>] and two on patients with type 2 diabetes mellitus [<xref ref-type="bibr" rid="ref29">29</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. Although technically meeting the broad Health Evidence topic criteria, their focus on secondary prevention in &#x201C;patient&#x201D; populations did not raise any concerns with their exclusion, as this is not generally a common focus area of local public health.</p><p>Starting in August 2020, the AI-assisted screening process using the 0.17 optimal threshold was integrated into the Health Evidence monthly workflow. Search results were uploaded to the DistillerSR platform monthly, and the AI feature was applied. References that scored below the optimal threshold were removed, and only the remaining references were screened manually.</p></sec><sec id="s3-1-3"><title>Phase 3: Findings From Quality Assurance Checks of the AI Feature</title><p>The quality assurance check on the 2020 annual database search results (n=9759) found that AI-assisted screening removed 6830 (70%) of references, with no references incorrectly excluded. In the subsets of the monthly searches for July 2022 (n=4160), September 2023 (n=4482), and October 2024 (n=5201), only one reference from the July 2022 monthly searches was incorrectly excluded by the AI-assisted screening and included at the full-text screening level. Similar to the testing phase, this missed reference examined secondary prevention in patients with type 2 diabetes mellitus. 
In general, the quality assurance sets showed consistency in the overall percentage of references removed, ranging from 69% to 70%. This indicates that the model&#x2019;s performance was maintained over a 3-year implementation period.</p></sec></sec><sec id="s3-2"><title>Objective 2: Evaluating the Impact of AI-Assisted Screening on Monthly Manual Screening</title><p>Over 3 full years of implementation in the Health Evidence monthly workflow, AI-assisted screening eliminated 70% (n=190,966) of the references identified in our database searches, resulting in 81,287 references that underwent manual screening. On average, the use of AI-assisted screening decreased the annual number of references requiring manual title and abstract screening from 90,751 to 27,096.</p><p>Implementing AI-assisted screening in the Health Evidence monthly workflow has considerably impacted the required staff time to screen references for Health Evidence. It reduced the time spent manually screening by an estimated 382 hours over a 3-year period. <xref ref-type="table" rid="table2">Table 2</xref> provides a specific breakdown of hours of screening time reduced per year.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Comparison of estimates of annual time needed to screen reference sets.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="top"/><td align="left" valign="top">Year 1 (hours)</td><td align="left" valign="top">Year 2 (hours)</td><td align="left" valign="top">Year 3 (hours)</td></tr></thead><tbody><tr><td align="left" valign="top">Estimated screening time without AI<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup>-assisted screening</td><td align="char" char="." valign="top">185.0</td><td align="char" char="." valign="top">181.8</td><td align="char" char="." 
valign="top">177.8</td></tr><tr><td align="left" valign="top">Estimated screening time with AI-assisted screening</td><td align="char" char="." valign="top">54.5</td><td align="char" char="." valign="top">54.7</td><td align="char" char="." valign="top">53.4</td></tr><tr><td align="left" valign="top">Estimated time saved per year</td><td align="char" char="." valign="top">130.5</td><td align="char" char="." valign="top">127.1</td><td align="char" char="." valign="top">124.4</td></tr><tr><td align="left" valign="top">Estimated cumulative time saved</td><td align="char" char="." valign="top">130.5</td><td align="char" char="." valign="top">257.6</td><td align="char" char="." valign="top">381.9</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>AI: artificial intelligence.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Principal Findings</title><p>This project demonstrates that AI-assisted screening is an efficient strategy to be used alongside manual title and abstract screening, removing as many as 70% of nonrelevant references while incorrectly excluding fewer than 1% of references annually. Given the very small percentage of studies incorrectly excluded and their focus on secondary prevention in specific &#x201C;patient&#x201D; populations, we consider the &#x201C;cost&#x201D; of missed references to have minimal impact on public health decision-making and worth the benefits of reduced references to screen for our evidence platform. This risk-benefit analysis will likely differ for other curated evidence platforms. To increase access to high-quality and timely evidence, curated evidence platforms on specific topic areas continue to grow in popularity. 
Examples include Epistemonikos [<xref ref-type="bibr" rid="ref31">31</xref>], Social Systems Evidence [<xref ref-type="bibr" rid="ref32">32</xref>], the Joanna Briggs Institute Evidence-Based Practice Database [<xref ref-type="bibr" rid="ref33">33</xref>], and Essential Evidence Plus [<xref ref-type="bibr" rid="ref34">34</xref>]. The exponential growth of peer-reviewed published evidence is a key challenge for maintaining these types of platforms. Using an already created AI feature reduced some of the upfront costs that would usually be involved in developing a new AI model. The successful implementation of AI-assisted screening to support the Health Evidence monthly workflow provides an example of a process from which others could learn in maintaining large databases of curated evidence using an existing AI tool.</p></sec><sec id="s4-2"><title>Comparison to Prior Work</title><p>To our knowledge, this is the first study to evaluate the suitability of a preexisting AI-assisted screening tool to maintain a curated evidence platform; however, two systematic reviews and one scoping review on different but related topics provide some comparative insights [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref35">35</xref>,<xref ref-type="bibr" rid="ref36">36</xref>]. Most applicable to this project&#x2019;s context, a 2021 systematic review included 10 papers assessing machine learning approaches to identify high-quality, clinically relevant evidence in the biomedical literature. Recall ranged from 9% to 98% but was generally above 85%, specificity ranged from 76% to 88%, and precision ranged from 9% to 86% [<xref ref-type="bibr" rid="ref35">35</xref>]. Looking at the use of AI in the conduct of systematic reviews, a 2015 systematic review included 44 papers assessing text mining approaches for study identification and reported a 30% to 70% workload savings with 95% recall [<xref ref-type="bibr" rid="ref21">21</xref>]. 
Finally, a more recent 2022 scoping review of 47 papers assessed the role of automation throughout the systematic review process. Of the 28 papers that evaluated AI approaches for relevance screening, sensitivity or recall ranged from 75% to 100%, specificity from 19% to 99%, precision from 8% to 83%, median potential screening time saved ranged from 9 to 185 hours per review, with overall workload reduction ranging from 6.89% to 70.74%, and error rates between 0% and 22% [<xref ref-type="bibr" rid="ref36">36</xref>]. Direct comparison of this project to the results of the studies identified in these reviews is challenging due to variations in the training, testing, and analysis approaches, as well as the inconsistent reporting of results. However, overall, the findings of these reviews align with those of this project, particularly concerning recall, specificity, and the reduction in the number of references that need to be screened manually.</p></sec><sec id="s4-3"><title>Lessons Learned</title><p>This project highlights several lessons learned on the use of preexisting AI tools for reference screening that may be important considerations for database management, specifically, or systematic review processes more generally. While the model training completed in this project was purpose-built for Health Evidence and cannot be directly applied to other platforms, describing the methods and results from this project may inform processes for others who are looking to reduce the number of manual references that need to be screened while minimizing false negatives. First, even with predeveloped software, the training, testing, and analysis of AI-assisted screening requires significant human resources. For the overall cost-benefit ratio to be favorable, the volume of references that require relevance screening should be substantial to justify the upfront investment. 
Curated evidence platforms like Health Evidence, which span a broad range of topics and generate over 10,000 search results each month, can find longer-term benefits in reduced screening time that justify the initial investment in training and testing. Second, access to large datasets that can be used as a reference standard for training and testing, such as what was available with Health Evidence as a longstanding curated evidence platform, is important to enable confidence in the adequate functioning of an AI-assisted approach for relevance screening. This approach may not be suited to smaller evidence platforms or systematic reviews that do not have access to such large datasets and reference standards. However, this approach may be transferable to systematic reviews with large search results, living systematic reviews, or systematic review updates if there is continuity in relevance criteria over time.</p><p>Third, concept drift is an important consideration for any AI model trained using historical data. Unlike AI models developed using continuous learning models, the model&#x2019;s performance may degrade over time. Conducting quality assurance testing at predetermined intervals is recommended to evaluate performance over time. Over 4 years of quality assurance checks, one reference was found to be incorrectly excluded at the full-text screening level, demonstrating continued performance of the AI model and suggesting that model retraining is not yet required. Through ongoing annual quality assurance testing, we continue to monitor model performance. Through this work, we learned that if a new topic is identified, its impact on model performance should be investigated and compared to a pre-established threshold (eg, &#x003C;1%). A separate hand search strategy can be implemented for distinct novel phenomena (eg, COVID-19). Continued monitoring for subtle shifts, such as new terminology, will inform if full retraining is required. 
A comprehensive assessment of optimal frequency and type of quality assurance practices is outside the scope of this project and is highly contextually dependent based on the purpose and scope of the platform, as well as how the sector evolves. These are important considerations, as frequent retraining of a model would require additional resources. While this project provides insight into the feasibility of integrating AI-assisted screening in similar contexts, the resources, training data, and monitoring practices required may vary depending on the complexity and scope of the research question or purpose of the curated platform.</p><p>Finally, while successfully applied to reduce the overall number of references requiring manual title and abstract screening, the results of this project do not support the use of AI screening as a replacement for manual screening. A thoughtful and cautious approach to establish where integration of AI could be most useful for similar types of platforms or projects is warranted.</p></sec><sec id="s4-4"><title>Implications for Practice</title><p>A key advantage of using AI-assisted screening to support Health Evidence is reducing the staff time required for title and abstract screening. While the full costs of training and implementing the AI feature were not calculated, the reduced screening time permits highly trained staff to be reallocated to more complex tasks in the Health Evidence monthly workflow that cannot be reliably augmented by AI. This ensures that public health decision makers can access quality appraised, newly emerging systematic reviews even more quickly. An advantage of using this AI-assisted screening approach is that it does not require frequent retraining for relevance screening. In contrast, training of new staff is needed when using only manual screening for reasons such as staff turnover or deployment to other project priorities. 
These factors help to reduce the resources required to keep Health Evidence up to date, increasing its long-term sustainability.</p></sec><sec id="s4-5"><title>Future Directions</title><p>The use of AI in database management and evidence synthesis is a rapidly evolving area with many gaps and areas for future research. To our knowledge, there are no other published reports on the practical application and real-world use of pre-existing AI tools to support curated evidence platforms. Studies published to date have explored a wide variety of AI approaches using different tools, for different purposes, and using different evaluation metrics, thus limiting the ability to draw overarching recommendations for AI use in practice, specifically for curated platforms. The authors encourage others to report on these types of continuous quality improvement projects to share learnings on the practical application of AI.</p><p>In this project, we used a pre-existing AI tool with the goal to minimize the number of references that need to be manually screened. For this purpose, the most relevant metrics to assess model performance included recall, specificity, negative predictive value, and absolute number of false negatives. In other studies of AI performance, <italic>F</italic><sub>1</sub>-score and the area under the receiver-operating characteristic curve are used when a balance between recall and specificity is required. Due to the second screening at the full text level, we did not choose to focus on minimizing false positives at this stage. As AI tools continue to evolve, future work could explore a greater focus on this, as well as including <italic>F</italic><sub>1</sub>-scores and area under the receiver-operating characteristic curve, to further general understanding of model functionality and optimization. Future work could also explore the use of explainable AI to better understand results and integrate learnings to improve the AI model. 
Investigating and comparing the value and risks of generative versus deterministic AI models for AI-assisted screening, including the use of continuous or incremental learning approaches rather than the manual thresholds used in this project, may offer further reductions in manual screening time.
MD, SEN-S, and ECC provided feedback on drafts and the final version of the manuscript.</p></fn><fn fn-type="conflict"><p>None declared.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AI</term><def><p>artificial intelligence</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="web"><article-title>What is public health?</article-title><source>Canadian Public Health Association</source><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cpha.ca/what-public-health">https://www.cpha.ca/what-public-health</ext-link></comment></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="web"><article-title>Evidence-informed decision making in public health</article-title><source>National Collaborating Centre for Methods and Tools</source><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.nccmt.ca/tools/eiph">https://www.nccmt.ca/tools/eiph</ext-link></comment></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dicenso</surname><given-names>A</given-names> </name><name name-style="western"><surname>Bayley</surname><given-names>L</given-names> </name><name name-style="western"><surname>Haynes</surname><given-names>RB</given-names> </name></person-group><article-title>Accessing pre-appraised evidence: fine-tuning the 5S model into a 6S model</article-title><source>Evidence Based Nurs</source><year>2009</year><month>10</month><volume>12</volume><issue>4</issue><fpage>99</fpage><lpage>101</lpage><pub-id pub-id-type="doi">10.1136/ebn.12.4.99-b</pub-id><pub-id pub-id-type="medline">19779069</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Robeson</surname><given-names>P</given-names> </name><name name-style="western"><surname>Dobbins</surname><given-names>M</given-names> </name><name name-style="western"><surname>DeCorby</surname><given-names>K</given-names> </name><name name-style="western"><surname>Tirilis</surname><given-names>D</given-names> </name></person-group><article-title>Facilitating access to pre-processed research evidence in public health</article-title><source>BMC Public Health</source><year>2010</year><month>02</month><day>24</day><volume>10</volume><fpage>1</fpage><lpage>10</lpage><pub-id pub-id-type="doi">10.1186/1471-2458-10-95</pub-id><pub-id pub-id-type="medline">20181270</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bowen</surname><given-names>S</given-names> </name><name name-style="western"><surname>Erickson</surname><given-names>T</given-names> </name><name name-style="western"><surname>Martens</surname><given-names>PJ</given-names> </name><name name-style="western"><surname>Crockett</surname><given-names>S</given-names> </name></person-group><article-title>More than &#x201C;using research&#x201D;: the real challenges in promoting evidence-informed decision-making</article-title><source>Healthc Policy</source><year>2009</year><month>02</month><volume>4</volume><issue>3</issue><fpage>87</fpage><lpage>102</lpage><pub-id pub-id-type="medline">19377360</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Humphries</surname><given-names>S</given-names> </name><name name-style="western"><surname>Stafinski</surname><given-names>T</given-names> </name><name name-style="western"><surname>Mumtaz</surname><given-names>Z</given-names> </name><name 
name-style="western"><surname>Menon</surname><given-names>D</given-names> </name></person-group><article-title>Barriers and facilitators to evidence-use in program management: a systematic review of the literature</article-title><source>BMC Health Serv Res</source><year>2014</year><month>04</month><day>14</day><volume>14</volume><fpage>1</fpage><lpage>15</lpage><pub-id pub-id-type="doi">10.1186/1472-6963-14-171</pub-id><pub-id pub-id-type="medline">24731719</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shafaghat</surname><given-names>T</given-names> </name><name name-style="western"><surname>Nasab</surname><given-names>MHI</given-names> </name><name name-style="western"><surname>Bahrami</surname><given-names>MA</given-names> </name><etal/></person-group><article-title>A mapping of facilitators and barriers to evidence-based management in health systems: a scoping review study</article-title><source>Syst Rev</source><year>2021</year><month>01</month><day>30</day><volume>10</volume><issue>1</issue><fpage>42</fpage><pub-id pub-id-type="doi">10.1186/s13643-021-01595-8</pub-id><pub-id pub-id-type="medline">33516269</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dobbins</surname><given-names>M</given-names> </name><name name-style="western"><surname>DeCorby</surname><given-names>K</given-names> </name><name name-style="western"><surname>Robeson</surname><given-names>P</given-names> </name><name name-style="western"><surname>Husson</surname><given-names>H</given-names> </name><name name-style="western"><surname>Tirilis</surname><given-names>D</given-names> </name><name name-style="western"><surname>Greco</surname><given-names>L</given-names> </name></person-group><article-title>A knowledge management tool for public 
health: health-evidence.ca</article-title><source>BMC Public Health</source><year>2010</year><month>08</month><day>18</day><volume>10</volume><fpage>1</fpage><lpage>16</lpage><pub-id pub-id-type="doi">10.1186/1471-2458-10-496</pub-id><pub-id pub-id-type="medline">20718970</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="web"><article-title>Updating Health Evidence</article-title><source>Health Evidence</source><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.healthevidence.org/our-search-strategy.aspx">https://www.healthevidence.org/our-search-strategy.aspx</ext-link></comment></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="web"><article-title>Health Evidence quality assessment tool</article-title><source>Health Evidence</source><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.healthevidence.org/our-appraisal-tools.aspx">https://www.healthevidence.org/our-appraisal-tools.aspx</ext-link></comment></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Lee</surname><given-names>E</given-names> </name><name name-style="western"><surname>Dobbins</surname><given-names>M</given-names> </name><name name-style="western"><surname>Decorby</surname><given-names>K</given-names> </name><name name-style="western"><surname>McRae</surname><given-names>L</given-names> </name><name name-style="western"><surname>Tirilis</surname><given-names>D</given-names> </name><name name-style="western"><surname>Husson</surname><given-names>H</given-names> </name></person-group><article-title>An optimal search filter for retrieving systematic reviews and meta-analyses</article-title><source>BMC Med Res 
Methodol</source><year>2012</year><month>04</month><day>18</day><volume>12</volume><fpage>1</fpage><lpage>11</lpage><pub-id pub-id-type="doi">10.1186/1471-2288-12-51</pub-id><pub-id pub-id-type="medline">22512835</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Aviv-Reuven</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rosenfeld</surname><given-names>A</given-names> </name></person-group><article-title>Publication patterns&#x2019; changes due to the COVID-19 pandemic: a longitudinal and short-term scientometric analysis</article-title><source>Scientometrics</source><year>2021</year><volume>126</volume><issue>8</issue><fpage>6761</fpage><lpage>6784</lpage><pub-id pub-id-type="doi">10.1007/s11192-021-04059-x</pub-id><pub-id pub-id-type="medline">34188333</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bastian</surname><given-names>H</given-names> </name><name name-style="western"><surname>Glasziou</surname><given-names>P</given-names> </name><name name-style="western"><surname>Chalmers</surname><given-names>I</given-names> </name></person-group><article-title>Seventy-five trials and eleven systematic reviews a day: how will we ever keep up?</article-title><source>PLoS Med</source><year>2010</year><month>09</month><day>21</day><volume>7</volume><issue>9</issue><fpage>e1000326</fpage><pub-id pub-id-type="doi">10.1371/journal.pmed.1000326</pub-id><pub-id pub-id-type="medline">20877712</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hoffmann</surname><given-names>F</given-names> </name><name 
name-style="western"><surname>Allers</surname><given-names>K</given-names> </name><name name-style="western"><surname>Rombey</surname><given-names>T</given-names> </name><etal/></person-group><article-title>Nearly 80 systematic reviews were published each day: observational study on trends in epidemiology and reporting over the years 2000-2019</article-title><source>J Clin Epidemiol</source><year>2021</year><month>10</month><volume>138</volume><fpage>1</fpage><lpage>11</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2021.05.022</pub-id><pub-id pub-id-type="medline">34091022</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Ioannidis</surname><given-names>JPA</given-names> </name></person-group><article-title>The mass production of redundant, misleading, and conflicted systematic reviews and meta-analyses</article-title><source>Milbank Q</source><year>2016</year><month>09</month><volume>94</volume><issue>3</issue><fpage>485</fpage><lpage>514</lpage><pub-id pub-id-type="doi">10.1111/1468-0009.12210</pub-id><pub-id pub-id-type="medline">27620683</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="web"><article-title>A common understanding: simplified AI definitions from leading standards</article-title><source>Digital NSW</source><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.digital.nsw.gov.au/policy/artificial-intelligence/a-common-understanding-simplified-ai-definitions-from-leading">https://www.digital.nsw.gov.au/policy/artificial-intelligence/a-common-understanding-simplified-ai-definitions-from-leading</ext-link></comment></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="web"><article-title>Information technology&#x2014;artificial intelligence&#x2014;artificial intelligence concepts and 
terminology</article-title><source>ISO/IEC</source><year>2022</year><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.iso.org/obp/ui/#iso:std:iso-iec:22989:ed-1:v1:en">https://www.iso.org/obp/ui/#iso:std:iso-iec:22989:ed-1:v1:en</ext-link></comment></nlm-citation></ref><ref id="ref18"><label>18</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>MacAdden</surname><given-names>V</given-names> </name></person-group><article-title>A closer look at natural language processing in systematic reviews</article-title><source>DistillerSR</source><year>2023</year><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.distillersr.com/resources/blog/a-closer-look-at-natural-language-processing-in-systematic-reviews">https://www.distillersr.com/resources/blog/a-closer-look-at-natural-language-processing-in-systematic-reviews</ext-link></comment></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Beller</surname><given-names>E</given-names> </name><name name-style="western"><surname>Clark</surname><given-names>J</given-names> </name><name name-style="western"><surname>Tsafnat</surname><given-names>G</given-names> </name><etal/></person-group><article-title>Making progress with the automation of systematic reviews: principles of the International Collaboration for the Automation of Systematic Reviews (ICASR)</article-title><source>Syst Rev</source><year>2018</year><month>05</month><day>19</day><volume>7</volume><issue>1</issue><fpage>77</fpage><pub-id pub-id-type="doi">10.1186/s13643-018-0740-7</pub-id><pub-id pub-id-type="medline">29778096</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Marshall</surname><given-names>IJ</given-names> </name><name name-style="western"><surname>Wallace</surname><given-names>BC</given-names> </name></person-group><article-title>Toward systematic review automation: a practical guide to using machine learning tools in research synthesis</article-title><source>Syst Rev</source><year>2019</year><month>07</month><day>11</day><volume>8</volume><issue>1</issue><fpage>163</fpage><pub-id pub-id-type="doi">10.1186/s13643-019-1074-9</pub-id><pub-id pub-id-type="medline">31296265</pub-id></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>O&#x2019;Mara-Eves</surname><given-names>A</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name><name name-style="western"><surname>McNaught</surname><given-names>J</given-names> </name><name name-style="western"><surname>Miwa</surname><given-names>M</given-names> </name><name name-style="western"><surname>Ananiadou</surname><given-names>S</given-names> </name></person-group><article-title>Using text mining for study identification in systematic reviews: a systematic review of current approaches</article-title><source>Syst Rev</source><year>2015</year><month>01</month><day>14</day><volume>4</volume><issue>1</issue><fpage>5</fpage><pub-id pub-id-type="doi">10.1186/2046-4053-4-5</pub-id><pub-id pub-id-type="medline">25588314</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="web"><article-title>Systematic review and literature review software by DistillerSR</article-title><source>DistillerSR</source><year>2024</year><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.distillersr.com/">https://www.distillersr.com/</ext-link></comment></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation 
citation-type="web"><article-title>AI screening&#x2014; DistillerSR user guide</article-title><source>DistillerSR</source><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.manula.com/manuals/evidence-partners/distillersr/1/en/topic/ai-preview-and-rank">https://www.manula.com/manuals/evidence-partners/distillersr/1/en/topic/ai-preview-and-rank</ext-link></comment></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hinder</surname><given-names>F</given-names> </name><name name-style="western"><surname>Vaquet</surname><given-names>V</given-names> </name><name name-style="western"><surname>Brinkrolf</surname><given-names>J</given-names> </name><name name-style="western"><surname>Hammer</surname><given-names>B</given-names> </name></person-group><article-title>Model-based explanations of concept drift</article-title><source>Neurocomputing</source><year>2023</year><month>10</month><volume>555</volume><fpage>126640</fpage><pub-id pub-id-type="doi">10.1016/j.neucom.2023.126640</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="web"><article-title>Non-research activities (e.g., quality improvement)</article-title><source>Hamilton Integrated Research Ethics Board</source><year>2024</year><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.hireb.ca/guidelines/quality-assurance/">https://www.hireb.ca/guidelines/quality-assurance/</ext-link></comment></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Ting</surname><given-names>KM</given-names> </name></person-group><article-title>Confusion matrix</article-title><source>Encyclopedia of Machine 
Learning</source><year>2010</year><publisher-name>Springer</publisher-name><fpage>209</fpage><pub-id pub-id-type="doi">10.1007/978-0-387-30164-8_157</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Poton</surname><given-names>R</given-names> </name><name name-style="western"><surname>Polito</surname><given-names>MD</given-names> </name></person-group><article-title>The effects of aerobic training on the CD4 cells, VO2max, and metabolic parameters in HIV-infected patients: a meta-analysis of randomized controlled trials</article-title><source>J Sports Med Phys Fitness</source><year>2020</year><month>04</month><volume>60</volume><issue>4</issue><fpage>634</fpage><lpage>642</lpage><pub-id pub-id-type="doi">10.23736/S0022-4707.19.10261-7</pub-id><pub-id pub-id-type="medline">31818061</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tao</surname><given-names>X</given-names> </name><name name-style="western"><surname>Lu</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Zhou</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Huang</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>Y</given-names> </name></person-group><article-title>Virologically suppressed HIV-infected patients on TDF-containing regimens significantly benefit from switching to TAF-containing regimens: a meta-analysis of randomized controlled trials</article-title><source>Int J Infect Dis</source><year>2019</year><month>10</month><volume>87</volume><fpage>43</fpage><lpage>53</lpage><pub-id pub-id-type="doi">10.1016/j.ijid.2019.07.011</pub-id><pub-id pub-id-type="medline">31330323</pub-id></nlm-citation></ref><ref 
id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Deyno</surname><given-names>S</given-names> </name><name name-style="western"><surname>Eneyew</surname><given-names>K</given-names> </name><name name-style="western"><surname>Seyfe</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Efficacy and safety of cinnamon in type 2 diabetes mellitus and pre-diabetes patients: a meta-analysis and meta-regression</article-title><source>Diabetes Res Clin Pract</source><year>2019</year><month>10</month><volume>156</volume><fpage>107815</fpage><pub-id pub-id-type="doi">10.1016/j.diabres.2019.107815</pub-id><pub-id pub-id-type="medline">31425768</pub-id></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Verboven</surname><given-names>M</given-names> </name><name name-style="western"><surname>Van Ryckeghem</surname><given-names>L</given-names> </name><name name-style="western"><surname>Belkhouribchia</surname><given-names>J</given-names> </name><etal/></person-group><article-title>Effect of exercise intervention on cardiac function in type 2 diabetes mellitus: a systematic review</article-title><source>Sports Med</source><year>2019</year><month>02</month><volume>49</volume><issue>2</issue><fpage>255</fpage><lpage>268</lpage><pub-id pub-id-type="doi">10.1007/s40279-018-1003-4</pub-id><pub-id pub-id-type="medline">30357657</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="web"><article-title>Epistemonikos: database of the best evidence-based health care</article-title><source>Epistemonikos</source><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.epistemonikos.org/">https://www.epistemonikos.org/</ext-link></comment></nlm-citation></ref><ref 
id="ref32"><label>32</label><nlm-citation citation-type="web"><article-title>McMaster health forum</article-title><source>Social Systems Evidence</source><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.socialsystemsevidence.org/">https://www.socialsystemsevidence.org/</ext-link></comment></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="web"><article-title>JBI EBP database</article-title><source>Joanna Briggs Institute</source><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://jbi.global/jbi-ebp-database">https://jbi.global/jbi-ebp-database</ext-link></comment></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="web"><source>Essential Evidence Plus</source><access-date>2025-07-03</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.essentialevidenceplus.com/">https://www.essentialevidenceplus.com/</ext-link></comment></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Abdelkader</surname><given-names>W</given-names> </name><name name-style="western"><surname>Navarro</surname><given-names>T</given-names> </name><name name-style="western"><surname>Parrish</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Machine learning approaches to retrieve high-quality, clinically relevant evidence from the biomedical literature: systematic review</article-title><source>JMIR Med Inform</source><year>2021</year><month>09</month><day>9</day><volume>9</volume><issue>9</issue><fpage>e30401</fpage><pub-id pub-id-type="doi">10.2196/30401</pub-id><pub-id pub-id-type="medline">34499041</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Khalil</surname><given-names>H</given-names> </name><name name-style="western"><surname>Ameen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Zarnegar</surname><given-names>A</given-names> </name></person-group><article-title>Tools to support the automation of systematic reviews: a scoping review</article-title><source>J Clin Epidemiol</source><year>2022</year><month>04</month><volume>144</volume><fpage>22</fpage><lpage>42</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2021.12.005</pub-id><pub-id pub-id-type="medline">34896236</pub-id></nlm-citation></ref></ref-list></back></article>