<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v26i1e52758</article-id>
      <article-id pub-id-type="pmid">39151163</article-id>
      <article-id pub-id-type="doi">10.2196/52758</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Human-Comparable Sensitivity of Large Language Models in Identifying Eligible Studies Through Title and Abstract Screening: 3-Layer Strategy Using GPT-3.5 and GPT-4 for Systematic Reviews</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Ma</surname>
            <given-names>Simone</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Fraile Navarro</surname>
            <given-names>David</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Nguyen</surname>
            <given-names>Tam</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Nakhostin-Ansari</surname>
            <given-names>Amin</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Matsui</surname>
            <given-names>Kentaro</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-4538-5381</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>Utsumi</surname>
            <given-names>Tomohiro</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-4673-7230</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Aoki</surname>
            <given-names>Yumi</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff4" ref-type="aff">4</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-2674-0707</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Maruki</surname>
            <given-names>Taku</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff5" ref-type="aff">5</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0009-3938-5466</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author">
          <name name-style="western">
            <surname>Takeshima</surname>
            <given-names>Masahiro</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff6" ref-type="aff">6</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-0614-7524</ext-link>
        </contrib>
        <contrib id="contrib6" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Takaesu</surname>
            <given-names>Yoshikazu</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff7" ref-type="aff">7</xref>
          <address>
            <institution>Department of Neuropsychiatry</institution>
            <institution>Graduate School of Medicine</institution>
            <institution>University of the Ryukyus</institution>
            <addr-line>207 Uehara</addr-line>
            <addr-line>Nishihara</addr-line>
            <addr-line>Okinawa, 903-0215</addr-line>
            <country>Japan</country>
            <phone>81 98 895 3331</phone>
            <email>takaesuy@med.u-ryukyu.ac.jp</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-9169-3249</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Clinical Laboratory, National Center Hospital</institution>
        <institution>National Center of Neurology and Psychiatry</institution>
        <addr-line>Kodaira</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Department of Sleep-Wake Disorders, National Institute of Mental Health</institution>
        <institution>National Center of Neurology and Psychiatry</institution>
        <addr-line>Kodaira</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Psychiatry</institution>
        <institution>The Jikei University School of Medicine</institution>
        <addr-line>Tokyo</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff4">
        <label>4</label>
        <institution>Graduate School of Nursing Science</institution>
        <institution>St. Luke’s International University</institution>
        <addr-line>Tokyo</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff5">
        <label>5</label>
        <institution>Department of Neuropsychiatry</institution>
        <institution>Kyorin University School of Medicine</institution>
        <addr-line>Tokyo</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff6">
        <label>6</label>
        <institution>Department of Neuropsychiatry</institution>
        <institution>Akita University Graduate School of Medicine</institution>
        <addr-line>Akita</addr-line>
        <country>Japan</country>
      </aff>
      <aff id="aff7">
        <label>7</label>
        <institution>Department of Neuropsychiatry</institution>
        <institution>Graduate School of Medicine</institution>
        <institution>University of the Ryukyus</institution>
        <addr-line>Okinawa</addr-line>
        <country>Japan</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Yoshikazu Takaesu <email>takaesuy@med.u-ryukyu.ac.jp</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>16</day>
        <month>8</month>
        <year>2024</year>
      </pub-date>
      <volume>26</volume>
      <elocation-id>e52758</elocation-id>
      <history>
        <date date-type="received">
          <day>14</day>
          <month>9</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>23</day>
          <month>1</month>
          <year>2024</year>
        </date>
        <date date-type="rev-recd">
          <day>10</day>
          <month>3</month>
          <year>2024</year>
        </date>
        <date date-type="accepted">
          <day>25</day>
          <month>6</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Kentaro Matsui, Tomohiro Utsumi, Yumi Aoki, Taku Maruki, Masahiro Takeshima, Yoshikazu Takaesu. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 16.08.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2024/1/e52758" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>The screening process for systematic reviews is resource-intensive. Although previous machine learning solutions have reported reductions in workload, they risked excluding relevant papers.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>We evaluated the performance of a 3-layer screening method using GPT-3.5 and GPT-4 to streamline the title and abstract-screening process for systematic reviews. Our goal is to develop a screening method that maximizes sensitivity for identifying relevant records.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>We conducted screenings on 2 of our previous systematic reviews related to the treatment of bipolar disorder, with 1381 records from the first review and 3146 from the second. Screenings were conducted using GPT-3.5 (gpt-3.5-turbo-0125) and GPT-4 (gpt-4-0125-preview) across three layers: (1) research design, (2) target patients, and (3) interventions and controls. The 3-layer screening was conducted using prompts tailored to each study. During this process, information extraction according to each study’s inclusion criteria and optimization for screening were carried out using a GPT-4–based flow without manual adjustments. Records were evaluated at each layer, and those meeting the inclusion criteria at all layers were subsequently judged as included.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>On each layer, both GPT-3.5 and GPT-4 were able to process about 110 records per minute, and the total time required for screening the first and second studies was approximately 1 hour and 2 hours, respectively. In the first study, the sensitivities/specificities of the GPT-3.5 and GPT-4 were 0.900/0.709 and 0.806/0.996, respectively. Both screenings by GPT-3.5 and GPT-4 judged all 6 records used for the meta-analysis as included. In the second study, the sensitivities/specificities of the GPT-3.5 and GPT-4 were 0.958/0.116 and 0.875/0.855, respectively. The sensitivities for the relevant records align with those of human evaluators: 0.867-1.000 for the first study and 0.776-0.979 for the second study. Both screenings by GPT-3.5 and GPT-4 judged all 9 records used for the meta-analysis as included. After accounting for justifiably excluded records by GPT-4, the sensitivities/specificities of the GPT-4 screening were 0.962/0.996 in the first study and 0.943/0.855 in the second study. Further investigation indicated that the cases incorrectly excluded by GPT-3.5 were due to a lack of domain knowledge, while the cases incorrectly excluded by GPT-4 were due to misinterpretations of the inclusion criteria.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>Our 3-layer screening method with GPT-4 demonstrated an acceptable level of sensitivity and specificity that supports its practical application in systematic review screenings. Future research should aim to generalize this approach and explore its effectiveness in diverse settings, both medical and nonmedical, to fully establish its use and operational feasibility.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>systematic review</kwd>
        <kwd>screening</kwd>
        <kwd>GPT-3.5</kwd>
        <kwd>GPT-4</kwd>
        <kwd>language model</kwd>
        <kwd>information science</kwd>
        <kwd>library science</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>prompt engineering</kwd>
        <kwd>meta-analysis</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Large language models (LLMs) with extensive parameters, honed on substantial textual data, have seen striking advancements recently. Following OpenAI’s third-generation Generative Pre-trained Transformer (GPT-3), LLMs now possess advanced competencies in various natural language processing tasks [<xref ref-type="bibr" rid="ref1">1</xref>]. Among these, ChatGPT, which is built on GPT-3.5—an iteration that improves upon GPT-3 by integrating both supervised and reinforcement learning techniques—has received particular attention [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref3">3</xref>]. GPT-3.5 has shown exceptional performance in the medical domain, achieving remarkable results on medical licensing examinations across different regions [<xref ref-type="bibr" rid="ref4">4</xref>]. Furthermore, GPT-4, the successor to GPT-3.5, has exhibited superior performance [<xref ref-type="bibr" rid="ref5">5</xref>], with its contextual understanding abilities potentially exceeding those of humans [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>]. Beyond its use for language editing [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref9">9</xref>], both GPT-3.5 and GPT-4 have proven to be effective tools for analyzing and comprehending the abstracts of research papers, offering potential benefits in the screening process for systematic reviews.</p>
      <p>Systematic reviews and subsequent meta-analyses bear crucial clinical significance. The screening of titles and abstracts is a crucial step in this process [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref13">13</xref>], often involving more than 1000 papers identified via targeted keyword searches [<xref ref-type="bibr" rid="ref14">14</xref>]. This screening process can take approximately 1 hour for every 60-120 papers [<xref ref-type="bibr" rid="ref10">10</xref>], which is a substantial drain on human and time resources. In addition, human error is inevitable in the screening process [<xref ref-type="bibr" rid="ref15">15</xref>-<xref ref-type="bibr" rid="ref17">17</xref>], and the number of such errors can increase as the number of papers to be screened increases, possibly due to fatigue and cognitive overload [<xref ref-type="bibr" rid="ref18">18</xref>,<xref ref-type="bibr" rid="ref19">19</xref>]. To mitigate this labor-intensive task, attempts have been made to use text mining and machine learning technologies [<xref ref-type="bibr" rid="ref17">17</xref>,<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. Although these methods have successfully reduced the workload, they risk omitting relevant papers, which could result in a high false-negative rate. Specifically, several studies reported the exclusion of records that should have been included in the meta-analysis [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. Consequently, the use of machine learning techniques, such as natural language processing, to assist with abstract screening has not yet been widely adopted [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref30">30</xref>]. For systematic reviews, maintaining high sensitivity for studies eligible for full-text assessment, ideally at 100% [<xref ref-type="bibr" rid="ref10">10</xref>], is crucial if human screening is to be fully supplanted by an automated process.</p>
      <p>With the advanced language-processing capabilities of GPT-3.5 and GPT-4 [<xref ref-type="bibr" rid="ref2">2</xref>,<xref ref-type="bibr" rid="ref5">5</xref>], there has been an expectation of achieving higher accuracy in screening processes. Kohandel Gargari et al [<xref ref-type="bibr" rid="ref31">31</xref>] conducted title and abstract screening using GPT-3.5, but the sensitivity for identifying relevant papers remained at a maximum of 69%, even after attempting various prompt modifications. Khraisha et al [<xref ref-type="bibr" rid="ref32">32</xref>] explored the use of GPT-4 across different systematic review processes and found that the sensitivity for title and abstract screening ranged between 42% and 50%. Guo et al [<xref ref-type="bibr" rid="ref33">33</xref>] have also demonstrated the use of GPT-4 in title and abstract screenings; however, the sensitivity for relevant papers was limited to 76%, highlighting the challenge of unintentionally excluding necessary records. Notably, Tran et al [<xref ref-type="bibr" rid="ref34">34</xref>] used GPT-3.5 for title and abstract screening with rigorous prompt adjustments, achieving a high sensitivity of 97.1% for relevant papers. While this high-sensitivity level might already be suitable for practical use in the systematic review process, its specificity was limited to 37.7% [<xref ref-type="bibr" rid="ref34">34</xref>].</p>
      <p>The aim of this study is to develop a title- and abstract-screening method using GPT-3.5 and GPT-4 that achieves as high a sensitivity as possible. Although the method of using GPT-3.5 by Tran et al [<xref ref-type="bibr" rid="ref34">34</xref>] achieved high sensitivity for identifying relevant papers, we aim to maintain high sensitivity while also improving specificity through a unique approach that incorporates GPT-4. To achieve this, we subdivided the process of determining inclusion for systematic reviews [<xref ref-type="bibr" rid="ref11">11</xref>] into 3 layers of screening. By breaking down the screening process into multiple steps, each addressing a specific aspect, we aimed to optimize the performance of the language models. In this study, we regarded the results of human screening as the gold standard and calculated the sensitivity and specificity of the GPT-3.5 and GPT-4 screening results in comparison with them. Furthermore, we carefully examined the records that were erroneously excluded by GPT-3.5/GPT-4. This examination was conducted to assess the appropriateness of their exclusion.</p>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Language Model Details</title>
        <p>GPT-3.5 and GPT-4, LLMs used in this study, are accessible through ChatGPT. However, ChatGPT does not support processing multiple queries against the titles and abstracts of scholarly papers simultaneously. To address this limitation, we leveraged the application programming interfaces (APIs) of GPT-3.5 and GPT-4, known as gpt-3.5-turbo and gpt-4-turbo-preview, respectively [<xref ref-type="bibr" rid="ref35">35</xref>].</p>
        <p>For gpt-3.5-turbo, we used the most current model available, gpt-3.5-turbo-0125. This model could be used at a low cost of US $0.50 per 1M tokens for input and US $1.50 per 1M tokens for output, with approximately 1000 tokens corresponding to 750 words [<xref ref-type="bibr" rid="ref36">36</xref>]. Similarly, for GPT-4, we used the latest model available, gpt-4-0125-preview, which was available at a cost of US $10.00 per 1M tokens for input and US $30.00 per 1M tokens for output [<xref ref-type="bibr" rid="ref36">36</xref>].</p>
      </sec>
      <sec>
        <title>Calling the GPT-3.5 and GPT-4 API</title>
        <p>In this study, we used Google Spreadsheet and Google Apps Script to interface with the GPT-3.5 and GPT-4 APIs for batch processing. Specifically, we created the “GPT35” function to call the gpt-3.5-turbo-0125 API within Google Spreadsheet. Users can invoke this function by entering “=GPT35([prompt])” into a cell, enabling the intuitive batch processing of multiple titles and abstracts. Similarly, we established the “GPT4” function to access the gpt-4-0125-preview API.</p>
        <p>Both the gpt-3.5-turbo-0125 and gpt-4-0125-preview have a parameter called “temperature,” which introduces “variability” in the responses—the higher the temperature, the greater the randomness, with a range between 0 and 2 [<xref ref-type="bibr" rid="ref37">37</xref>]. As described later in this study, the decision to include or exclude records was delegated to GPT-3.5 and GPT-4. In preliminary trials, it was observed that setting the temperature above 0 resulted in varying responses from one trial to another. In addition, setting the temperature above 0 can lead to unexpected responses. When instructed to respond with either “E” (for exclusion) or “I” (for inclusion), if the temperature is 0, the output will be strictly “E” or “I.” However, if the temperature is above 0, even if it is only 0.1, the response might be, for example, “The answer is ‘E’.” In light of these observations, and primarily to ensure reproducibility, this study fixed the temperature at 0 for all screenings. The Apps Script used in this study is shown in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Process of Screening and Prompt Engineering</title>
        <p>Generally, in a systematic review, a comprehensive examination is conducted on studies that address a relevant clinical question. After a comprehensive literature search is performed to identify all potential studies for review, each record is assessed to determine whether it addresses the clinical question [<xref ref-type="bibr" rid="ref11">11</xref>]. In this study, we used either GPT-3.5 or GPT-4 to assess the inclusion or exclusion of relevant papers at each of the following three layers: (1) research design, (2) target population, and (3) intervention and control [<xref ref-type="bibr" rid="ref11">11</xref>]. Records not deemed for exclusion at any of these layers were classified as “included.” We present the workflow of the process we conducted in <xref rid="figure1" ref-type="fig">Figure 1</xref>.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Three-layer screening process using GPT-3.5 and GPT-4 for literature review.</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e52758_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>The characteristics of the 2 systematic review papers [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>] used in this study are summarized in <xref ref-type="table" rid="table1">Table 1</xref>. The first paper by Takeshima et al [<xref ref-type="bibr" rid="ref38">38</xref>] investigated the efficacy of bright light therapy in patients with bipolar disorder. In this study, the titles and abstracts of a total of 1381 records were initially screened in duplicate, with the task being divided between 2 pairs of independent evaluators. The first pair reviewed the initial 753 records, while the second pair assessed the remaining 628 records. Of these, 30 records were targeted for a full-text assessment, and eventually 6 records (encompassing 6 studies) were selected for meta-analysis. The second paper by Maruki et al [<xref ref-type="bibr" rid="ref39">39</xref>] verified the difference in therapeutic effects between the usage of 2 types: second-generation antipsychotics (SGAs) and mood stabilizers (MSs), versus the usage of either type alone, targeting patients with bipolar disorder. In this study, the titles and abstracts of a total of 3146 records were initially screened in duplicate, with the screening divided between 2 pairs of evaluators. The first pair reviewed the initial 1694 records, while the second pair evaluated the remaining 1452 records. Of these, 96 records were targeted for a full-text assessment, and eventually 9 records (encompassing 5 studies) were selected for meta-analysis. We used the data on the inclusion or exclusion decisions of each human evaluator made prior to reaching a consensus among evaluators.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Characteristics of the 2 selected systematic review studies.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="250"/>
            <col width="250"/>
            <thead>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Takeshima et al (2020) [<xref ref-type="bibr" rid="ref38">38</xref>]</td>
                <td>Maruki et al (2022) [<xref ref-type="bibr" rid="ref39">39</xref>]</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Clinical question</td>
                <td>Is bright light therapy an effective and safe treatment for managing manic and depressive symptoms in patients with bipolar disorder, and can it also be used as a preventive measure for recurrent mood episodes?</td>
                <td>Does the use of second-generation antipsychotics (SGA) or mood stabilizers (MS) as adjunctive therapy improve the efficacy and safety outcomes compared to their use as monotherapy in the treatment of bipolar depression?</td>
              </tr>
              <tr valign="top">
                <td>Databases</td>
                <td>Ovid MEDLINE, Cochrane Central Register of Controlled Trials, Embase, PsycINFO, and ClinicalTrials.gov</td>
                <td>PubMed, Cochrane Central Register of Controlled Trials, and Embase</td>
              </tr>
              <tr valign="top">
                <td>Number of records screened</td>
                <td>1381</td>
                <td>3146</td>
              </tr>
              <tr valign="top">
                <td>Number of records for full-text assessment</td>
                <td>30</td>
                <td>96</td>
              </tr>
              <tr valign="top">
                <td>Number of records (studies) included in quantitative synthesis</td>
                <td>6 (6)</td>
                <td>9 (5)</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>The screening process was divided into three layers: (1) research design, (2) target population, and (3) intervention and control. The prompts for each layer must be specifically tailored to each systematic review. At this point, manual prompt adjustments could lead to issues with reproducibility in future research. Therefore, in this study, we used GPT-4 (gpt-4-0125-preview, temperature=0) to automatically extract the information and generate the content for the prompts related to “research design,” “target population,” “intervention,” and “control.” The prompts used for extraction, along with the content defined for “research design,” “target population,” “intervention,” and “control,” are detailed in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref>. In this study, we extracted information by inserting the text from the “inclusion criteria” paragraph of the Methods section of each paper into the specified location in the prompt (<xref ref-type="boxed-text" rid="box1">Textbox 1</xref>).</p>
        <p>The structure of the prompts for each of the 3 layers is shown in <xref ref-type="boxed-text" rid="box2">Textbox 2</xref>. Within these prompts, we specified that if a decision cannot be made, records should be considered potentially eligible for full-text assessment and not excluded. In this study, the information supplied to GPT-3.5 and GPT-4 was limited to the titles and abstracts of the records; details such as authors, their affiliations, or journal names were not included in the prompts.</p>
        <p>In the screening process using GPT-3.5 or GPT-4, we initially verified whether the research design of all records satisfied the inclusion criteria. For records not excluded in the first layer, we subsequently confirmed whether the target population aligned with the inclusion criteria. Moreover, for records that were not excluded in the first and second layers, we assessed whether both the intervention and control groups met the inclusion criteria (<xref rid="figure1" ref-type="fig">Figure 1</xref>).</p>
        <boxed-text id="box1" position="float">
          <title>Prompts used for extracting information and the extracted details on research design, target population, intervention, and control from each study.</title>
          <list list-type="bullet">
            <list-item>
              <p>Prompt for extracting information</p>
              <p>#Inclusion criteria</p>
              <p>
                <italic>[The descriptions of the inclusion criteria presented within the Methods section of each paper were inserted here]</italic>
              </p>
              <p>#Query</p>
              <p>The above is an inclusion criteria of a systematic review paper. Please describe the records included in the meta-analysis in a sentence: 1) research design, 2) target population, 3) intervention, and 4) control. Please list each item without including verbs.</p>
              <p>#Your answer</p>
              <list list-type="order">
                <list-item>
                  <p>Research design: [insert your answer here]</p>
                </list-item>
                <list-item>
                  <p>Target population: [insert your answer here]</p>
                </list-item>
                <list-item>
                  <p>Intervention: [insert your answer here]</p>
                </list-item>
                <list-item>
                  <p>Control: [insert your answer here]</p>
                </list-item>
              </list>
            </list-item>
          </list>
          <list list-type="bullet">
            <list-item>
              <p>Response to the above prompt</p>
              <p>Inclusion criteria of Takeshima et al (2020) [<xref ref-type="bibr" rid="ref38">38</xref>], extracted by GPT-4</p>
              <list list-type="order">
                <list-item>
                  <p>Research design: Randomized controlled trials (RCTs) at the individual or cluster level, including crossover studies reporting results from the first period.</p>
                </list-item>
                <list-item>
                  <p>Target population: Patients with a clinical diagnosis of bipolar disorder (BD), type I or type II.</p>
                </list-item>
                <list-item>
                  <p>Intervention: Any kind of light therapy, including 'light therapy,' 'bright light therapy,' 'phototherapy,' or chronotherapy in any intensity and color.</p>
                </list-item>
                <list-item>
                  <p>Control: Sham treatment (e.g., low-intensity light, dim red light, or negative ion) or treatment as usual (no light treatment).</p>
                </list-item>
              </list>
              <p>Inclusion criteria of Maruki et al (2022) [<xref ref-type="bibr" rid="ref39">39</xref>], extracted by GPT-4</p>
              <list list-type="order">
                <list-item>
                  <p>Research design: Randomized controlled trials (RCTs) at the individual or cluster level, including crossover studies before crossover</p>
                </list-item>
                <list-item>
                  <p>Target population: Participants diagnosed with bipolar I or II depression, including mixed features and/or rapid cycling.</p>
                </list-item>
                <list-item>
                  <p>Intervention: Adjunctive therapy with second-generation antipsychotics (SGA) or mood stabilizers (MS) during baseline treatment with SGA or MS.</p>
                </list-item>
                <list-item>
                  <p>Control: Adjunctive therapy with a placebo during baseline treatment with second-generation antipsychotics (SGA) or mood stabilizers (MS).</p>
                </list-item>
              </list>
            </list-item>
          </list>
        </boxed-text>
        <boxed-text id="box2" position="float">
          <title>The structure of the prompts used for each of the 3 layers in the screening process.</title>
          <list list-type="order">
            <list-item>
              <p>Prompt for research design</p>
              <p>#Title and abstract</p>
              <p>Title: [<italic>Title of the record was inserted here</italic>]</p>
              <p>Abstract: [<italic>Abstract of the record was inserted here</italic>]</p>
              <p>#Research design</p>
              <p>[<italic>The ‘research design’ specified in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref> was inserted here</italic>]</p>
              <p>#Query</p>
              <p>You are a researcher rigorously screening titles and abstracts of scientific papers for inclusion or exclusion in a review paper.</p>
              <p>Does the paper with the above title and abstract meet the specified research design? If yes, highly suspected, or difficult to determine, answer 'I'. If not, answer 'E'.</p>
              <p>#Rules</p>
              <p>You can reply using only 'E' or 'I'.</p>
              <p>#Your answer:</p>
            </list-item>
            <list-item>
              <p>Prompt for target population</p>
              <p>#Title and Abstract</p>
              <p>Title: [<italic>Title of the record was inserted here</italic>]</p>
              <p>Abstract: [<italic>Abstract of the record was inserted here</italic>]</p>
              <p>#Target population</p>
              <p>[<italic>The ‘target population’ specified in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref> was inserted here</italic>]</p>
              <p>#Query</p>
              <p>You are a researcher rigorously screening titles and abstracts of scientific papers for inclusion or exclusion in a review paper.</p>
              <p>Does the paper with the above title and abstract meet the specified target population? If yes, highly suspected, or difficult to determine, answer ‘I’. If not, answer ‘E’.</p>
              <p>#Rules</p>
              <p>You can reply using only ‘E’ or ‘I’.</p>
              <p>#Your answer:</p>
            </list-item>
            <list-item>
              <p>Prompt for intervention and control</p>
              <p>#Title and abstract</p>
              <p>Title: [<italic>Title of the record was inserted here</italic>]</p>
              <p>Abstract: [<italic>Abstract of the record was inserted here</italic>]</p>
              <p>#Intervention</p>
              <p>[<italic>The ‘intervention’ specified in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref> was inserted here</italic>]</p>
              <p>#Control</p>
              <p>[<italic>The ‘control’ specified in <xref ref-type="boxed-text" rid="box1">Textbox 1</xref> was inserted here</italic>]</p>
              <p>#Query</p>
              <p>You are a researcher rigorously screening titles and abstracts of scientific papers for inclusion or exclusion in a review paper.</p>
              <p>Does the paper with the above title and abstract meet the specified intervention and control criteria? If yes, highly suspected, or difficult to determine, answer 'I'. If not, answer 'E'.</p>
              <p>#Rules</p>
              <p>You can reply using only 'E' or 'I'.</p>
              <p>#Your answer:</p>
            </list-item>
          </list>
        </boxed-text>
      </sec>
      <sec>
        <title>Data Analysis</title>
        <p>In this study, we analyzed the results from human evaluators of systematic review papers, comparing these with the records identified by GPT-3.5 or GPT-4. We considered the records included in the full-text assessment to be correct. We assessed the inclusion or exclusion decisions made by each human evaluator (before consensus was reached) against those determined by GPT-3.5 or GPT-4, focusing on sensitivity and specificity. Sensitivity was defined as the proportion of correctly identified eligible records for full-text assessment by human evaluators, GPT-3.5, or GPT-4. Formally, sensitivity is calculated as follows:</p>
        <disp-formula>Sensitivity = True positives / (True positives + False negatives)</disp-formula>
        <p>where:</p>
        <disp-formula>True positives = Number of records correctly identified as eligible</disp-formula>
        <disp-formula>False negatives = Number of records incorrectly identified as ineligible.</disp-formula>
        <p>Similarly, specificity was defined as the proportion of correctly identified ineligible records (for full-text assessment) by human evaluators, GPT-3.5, or GPT-4. Formally, specificity is calculated as follows:</p>
        <disp-formula>Specificity = True negatives / (True negatives + False positives)</disp-formula>
        <p>where:</p>
        <disp-formula>True negatives = Number of records correctly identified as ineligible</disp-formula>
        <disp-formula>False positives = Number of records incorrectly identified as eligible.</disp-formula>
        <p>For records eligible for full-text assessment but excluded by either GPT-3.5 or GPT-4, we reviewed the title and the abstract to assess whether the exclusion decision was justified. Following this review, we recalculated sensitivity and specificity after adjusting for these justified exclusions. Furthermore, for records that were incorrectly excluded by GPT-3.5 or GPT-4, we conducted a narrative verification of the erroneous judgments by asking each LLM to explain the reasons behind their decisions. We modified the prompt used for screening (<xref ref-type="boxed-text" rid="box2">Textbox 2</xref>) by replacing the “#Rules” statement with “Specify the reason for your answer.” This modification allowed GPT-3.5 or GPT-4 to provide their judgment results along with the underlying reasons.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study used only publicly available data from research papers and does not involve human subjects or personal data. Therefore, it does not require a human subject ethics review or exemption.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Results of the Paper by Takeshima et al</title>
        <p><xref rid="figure2" ref-type="fig">Figure 2</xref> [<xref ref-type="bibr" rid="ref38">38</xref>] shows the number of records excluded by GPT-3.5 and GPT-4 at each layer of research design, target population, and intervention and control, applied to records in the paper by Takeshima et al [<xref ref-type="bibr" rid="ref38">38</xref>].</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Comparison of 3-layer screening results using GPT-3.5 and GPT-4 with human evaluation for Takeshima et al [<xref ref-type="bibr" rid="ref38">38</xref>].</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e52758_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>GPT-3.5 excluded 84 records at the research design layer, 877 records at the target population layer, and 0 records at the intervention and control layer, ultimately determining 420 out of 1381 records for inclusion. None of the 6 records (including 6 papers) that were included in the meta-analysis were excluded by GPT-3.5. The sensitivity for included records was 0.900 and the specificity was 0.709. Among the eligible records for full-text assessment, GPT-3.5 classified 3 (10.0%) records as excluded. Of these, the exclusion of 2 records by GPT-3.5 was justified, while the remaining 1 (3.3%) record was deemed to require full-text assessment (<xref ref-type="table" rid="table2">Table 2</xref>). After adjustments for these justified judgments (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>), the sensitivity improved to 0.966 and the specificity remained at 0.710. For the one record that GPT-3.5 determined to be excluded at the target population layer, it was suggested that GPT-3.5 concluded that the record “included both bipolar disorder and unipolar mood disorder, which did not match the selection criteria.”</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Records eligible for full-text assessment in the paper by Takeshima et al [<xref ref-type="bibr" rid="ref38">38</xref>] that were excluded by GPT-3.5 and GPT-4.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="410"/>
            <col width="180"/>
            <col width="180"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="3">Number of excluded records on each layer (number of those not justified)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>Research design</td>
                <td>Target population</td>
                <td>Intervention and control</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="5">
                  <bold>Number of records eligible for full-text assessment (n=30)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Excluded by GPT-3.5</td>
                <td>0</td>
                <td>3 (1)<sup>a</sup></td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Excluded by GPT-4</td>
                <td>4 (1)<sup>a</sup></td>
                <td>2 (0)<sup>a</sup></td>
                <td>0</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Number of records for which exclusion was not justified.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>GPT-4 excluded 589 records at the research design layer, 760 records at the target population layer, and 1 record at the intervention and control layer, ultimately determining 31 out of 1381 records for inclusion. None of the 6 records (including 6 papers) that were included in the meta-analysis were excluded by GPT-4. The sensitivity for included records was 0.806 and the specificity was 0.996. Among the eligible records for full-text assessment, GPT-4 classified 6 (20.0%) records as excluded. Of these, the exclusion of 5 records by GPT-4 was justified, while the remaining 1 (3.3%) record was considered to require full-text assessment (<xref ref-type="table" rid="table2">Table 2</xref>). After adjustments for these justified judgments (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>), the sensitivity improved to 0.962 and the specificity remained at 0.996. GPT-4 included all 6 records (including 6 papers) that were included in the meta-analysis. For the one record that GPT-4 judged to be excluded at the research design layer, it was revealed that GPT-4 deduced that “although this study mentioned registration in an RCT, it investigated the associations between sleep, physical activity, and circadian rhythm indicators” (from the perspective of whether to include the study in the meta-analysis, GPT-4’s judgment is likely to be correct; however, considering the purpose of the initial screening, we determined that it would be appropriate to include the study).</p>
      </sec>
      <sec>
        <title>Results of the Paper by Maruki et al</title>
        <p><xref rid="figure3" ref-type="fig">Figure 3</xref> [<xref ref-type="bibr" rid="ref39">39</xref>] shows the number of records excluded by GPT-3.5 and GPT-4 at each layer of research design, target population, and intervention and control, applied to records in the Maruki et al [<xref ref-type="bibr" rid="ref39">39</xref>] paper.</p>
        <p>GPT-3.5 excluded 220 records at the research design layer, 126 records at the target population layer, and 10 records at the intervention and control layer, ultimately determining 2790 out of 3146 records for inclusion. None of the 9 records (including 9 papers) that were included in the meta-analysis were excluded by GPT-3.5. The sensitivity for included records was 0.958 and the specificity was 0.116. Among the eligible records for full-text assessment, GPT-3.5 classified 4 (4.2%) records as excluded. None of these records’ exclusion by GPT-3.5 was justified, and all were considered to require full-text assessment (<xref ref-type="table" rid="table3">Table 3</xref> and <xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>). For the 2 records that GPT-3.5 inferred to be excluded at the research design layer, it was revealed that GPT-3.5 determined that “although they were RCTs, either the individual or cluster level was not specified” for both records. For the 2 records that GPT-3.5 deemed to be excluded at the target population layer, it was suggested that GPT-3.5 surmised that “although the records involved bipolar disorder, they did not match the selection criteria due to the presence of comorbidities (one record had generalized anxiety disorder, and the other had alcohol dependence).”</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Comparison of 3-layer screening results using GPT-3.5 and GPT-4 with human evaluation for Maruki et al [<xref ref-type="bibr" rid="ref39">39</xref>].</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e52758_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Records eligible for full-text assessment in the paper by Maruki et al [<xref ref-type="bibr" rid="ref39">39</xref>] that were excluded by GPT-3.5 and GPT-4.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="420"/>
            <col width="180"/>
            <col width="180"/>
            <col width="190"/>
            <thead>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td colspan="3">Number of excluded records on each layer (number of those not justified)</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>Research design</td>
                <td>Target population</td>
                <td>Intervention and control</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
                <td>
                  <break/>
                </td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="5">
                  <bold>Number of records eligible for full-text assessment (n=96)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Excluded by GPT-3.5</td>
                <td>2 (2)<sup>a</sup></td>
                <td>2 (2)<sup>a</sup></td>
                <td>0</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Excluded by GPT-4</td>
                <td>5 (0)<sup>a</sup></td>
                <td>2 (1)<sup>a</sup></td>
                <td>5 (3)<sup>a</sup></td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>Number of records for which exclusion was not justified.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>GPT-4 excluded 1287 records at the research design layer, 503 records at the target population layer, and 830 records at the intervention and control layer, ultimately determining 526 out of 3146 records for inclusion. None of the 9 records (including 9 papers) that were included in the meta-analysis were excluded by GPT-4. The sensitivity for included records was 0.875 and the specificity was 0.855. Among the eligible records for full-text assessment, GPT-4 classified 12 (12.5%) records as excluded. Of these, the exclusion of 8 records by GPT-4 was justified, while the remaining 4 (4.2%) records were considered to require full-text assessment (<xref ref-type="table" rid="table3">Table 3</xref>). After adjustments for these justified judgments (<xref ref-type="supplementary-material" rid="app2">Multimedia Appendix 2</xref>), the sensitivity improved to 0.943 and the specificity remained at 0.855. For the one record that GPT-4 determined to be excluded at the target population layer, it was suggested that GPT-4 inferred that “although the record involved bipolar disorder, it did not match the selection criteria due to the presence of a comorbidity (alcohol dependence).” For the 3 records that GPT-4 judged to be excluded at the intervention and control layer, in each case, GPT-4 cited the reason for exclusion as “the intervention criteria are the addition of either SGA or MS to SGA or MS, but this study does not mention the use of SGA.”</p>
        <p>In the list used in the paper by Maruki et al [<xref ref-type="bibr" rid="ref39">39</xref>], there were a total of 355 records where part of the title and abstract were corrupted into irrelevant Chinese characters (eg, “This was an eight窶陣eek, open窶人abel, prospective study”). Despite these errors, all cases could be appropriately discerned, likely due to the context-sensitive judgment capability of GPT-3.5 and GPT-4.</p>
      </sec>
      <sec>
        <title>Comparison of GPT-3.5, GPT-4, and Human Evaluators</title>
        <p>Both the study by Takeshima et al [<xref ref-type="bibr" rid="ref38">38</xref>] and the study by Maruki et al [<xref ref-type="bibr" rid="ref39">39</xref>] involved 2 individuals conducting screening for the initial segment, while a different set of 2 individuals was responsible for the screening of the latter segment. The sensitivity and specificity of human evaluators and GPT-3.5 and GPT-4 for each segment are shown in <xref ref-type="table" rid="table4">Table 4</xref>. The adjusted results, in cases where the exclusion of GPT-3.5 or GPT-4 was justified, are shown in the numbers within parentheses (<xref ref-type="table" rid="table4">Table 4</xref>).</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Comparison of evaluation metrics: GPT-3.5, GPT-4, and human evaluators.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="310"/>
            <col width="90"/>
            <col width="90"/>
            <col width="90"/>
            <col width="90"/>
            <col width="0"/>
            <col width="150"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td colspan="2">Screenings on Takeshima et al (2020) [<xref ref-type="bibr" rid="ref38">38</xref>]</td>
                <td colspan="5">Human evaluators</td>
                <td colspan="2">LLMs<sup>a</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>1A</td>
                <td>2A</td>
                <td>3A</td>
                <td>4A</td>
                <td colspan="2">GPT-3.5</td>
                <td>GPT-4</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="9">
                  <bold>Initial segment (n=753)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Sensitivity</td>
                <td>1.000</td>
                <td>0.867</td>
                <td>—<sup>b</sup></td>
                <td>—</td>
                <td colspan="2">0.800 (0.929)<sup>c</sup></td>
                <td>0.688 (1.000)<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Specificity</td>
                <td>0.995</td>
                <td>0.996</td>
                <td>—</td>
                <td>—</td>
                <td colspan="2">0.702 (0.704)<sup>c</sup></td>
                <td>0.997 (0.997)<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Latter segment (n=628)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Sensitivity</td>
                <td>—</td>
                <td>—</td>
                <td>1.000</td>
                <td>1.000</td>
                <td colspan="2">1.000 (1.000)<sup>c</sup></td>
                <td>0.933 (0.933)<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Specificity</td>
                <td>—</td>
                <td>—</td>
                <td>1.000</td>
                <td>0.997</td>
                <td colspan="2">0.718 (0.718)<sup>c</sup></td>
                <td>0.993 (0.993)<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="2">Screenings on Maruki et al (2022) [<xref ref-type="bibr" rid="ref39">39</xref>]</td>
                <td colspan="5">Human evaluators</td>
                <td colspan="2">LLMs</td>
              </tr>
              <tr valign="top">
                <td colspan="2">
                  <break/>
                </td>
                <td>1B</td>
                <td>2B</td>
                <td>3B</td>
                <td>4B</td>
                <td colspan="2">GPT-3.5</td>
                <td>GPT-4</td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Initial segment (n=1694)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Sensitivity</td>
                <td>0.766</td>
                <td>0.979</td>
                <td>—</td>
                <td>—</td>
                <td colspan="2">0.936</td>
                <td>0.872 (0.952)<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Specificity</td>
                <td>0.998</td>
                <td>0.998</td>
                <td>—</td>
                <td>—</td>
                <td colspan="2">0.129</td>
                <td>0.886 (0.886)<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td colspan="9">
                  <bold>Latter segment (n=1452)</bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Sensitivity</td>
                <td>—</td>
                <td>—</td>
                <td>0.776</td>
                <td>0.939</td>
                <td colspan="2">0.980</td>
                <td>0.878 (0.935)<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>Specificity</td>
                <td>—</td>
                <td>—</td>
                <td>0.999</td>
                <td>0.999</td>
                <td colspan="2">0.100</td>
                <td>0.818 (0.819)<sup>c</sup></td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>LLMs: large language models.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>Not applicable.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>Values after adjusting for cases where exclusion was justified.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Time and Cost Required for Screenings</title>
        <p>In our Google Spreadsheet setup, both GPT-3.5 and GPT-4 managed to process approximately 110 records per minute across each of the 3 layers. Consequently, the estimated ideal completion time was between 20 and 30 minutes for the study by Takeshima et al [<xref ref-type="bibr" rid="ref38">38</xref>], and between 60 and 80 minutes for the study by Maruki et al [<xref ref-type="bibr" rid="ref39">39</xref>]. However, in practice, due to errors with the Google Spreadsheet and API, the screening process took about 1 hour for the study by Takeshima et al [<xref ref-type="bibr" rid="ref38">38</xref>] and about 2 hours in total for the study by Maruki et al [<xref ref-type="bibr" rid="ref39">39</xref>]. Furthermore, due to daily API call limits, the work had to be spread out over 3 days. The screening for these 2 studies incurred a total cost of US $59, with US $4 for calls to GPT-3.5 and US $55 for calls to GPT-4.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>This study demonstrates the use of a 3-layer screening method using GPT-3.5 and GPT-4 for title and abstract screenings in systematic reviews, highlighting its remarkable speed and sensitivity comparable with that of human evaluators. However, GPT-3.5 demonstrated low specificity for relevant records, rendering it less practical. In contrast, the use of GPT-4 showed both high sensitivity and specificity, particularly where adjustments for justified exclusions led to an improvement in sensitivity. Although achieving 100% sensitivity remained unattainable, a 3-layer screening method with GPT-4 may potentially be practical for use in the systematic review process and can reduce human labor.</p>
        <p>Previous research demonstrating the effectiveness of automated screening using text mining has encountered sensitivity issues [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. Specifically, the exclusion of important studies that should have been included in their meta-analysis [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref29">29</xref>], a limitation not observed in our approach, hampered their application to clinical practice. False negatives in machine learning–based screening can arise from several factors: complexity in research design, characteristics of the target demographic, types of interventions, complexity in selection criteria, a significant scarcity of relevant records within the data set (leading to data imbalance), and inconsistency in the terminology used for judgment [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref29">29</xref>]. Our method using GPT-3.5 or GPT-4 was able to address issues related to data set imbalance and terminology inconsistency, as we used the same prompt across records, and assess the inclusion or exclusion one by one. In addition, previous text mining screenings may not have effectively addressed garbled text, such as “open-label” mistakenly appearing as “open窶人abel” [<xref ref-type="bibr" rid="ref40">40</xref>], an issue that LLMs can potentially mitigate through their attention mechanisms [<xref ref-type="bibr" rid="ref41">41</xref>]. Moreover, the outstanding knowledge base of GPT-4 [<xref ref-type="bibr" rid="ref6">6</xref>,<xref ref-type="bibr" rid="ref7">7</xref>] likely helped address the complexity in research design, target demographics, and intervention, as well as selection criteria—areas where GPT-3.5 might have fallen short. 
These distinctions possibly account for the notable differences in specificity observed between GPT-3.5 and GPT-4. Recently, Guo et al [<xref ref-type="bibr" rid="ref33">33</xref>] conducted title and abstract screening using GPT-4. Their approach diverges from our 3-layer method; it integrated inclusion and exclusion criteria within the context, generating decisions and reasoning through a single prompt. While we believe that our 3-layer method could potentially offer greater sensitivity than theirs, it remains difficult to definitively assert a significant improvement in sensitivity over the method by Guo et al [<xref ref-type="bibr" rid="ref33">33</xref>], given the limited sample size and the differences in data sets. Tran and colleagues’ approach [<xref ref-type="bibr" rid="ref34">34</xref>], despite using GPT-3.5, demonstrated remarkable sensitivity. It is important to note, however, that the manual creation of their highly effective prompt raises questions regarding its replicability and broader applicability.</p>
        <p>Both human-conducted and LLM-conducted systematic reviews have their inherent pitfalls. Errors made by humans are inevitable, with the error rate estimated to be around 10% [<xref ref-type="bibr" rid="ref15">15</xref>], and slightly higher for false exclusions, at approximately 13%-14% [<xref ref-type="bibr" rid="ref16">16</xref>,<xref ref-type="bibr" rid="ref17">17</xref>]. These values represent the performance of experts in the relevant field, and the accuracy may be lower for individuals with less expertise or shallow screening experience; therefore, guidelines have recommended piloting and training the abstract screening team [<xref ref-type="bibr" rid="ref12">12</xref>]. In this study, we observed that human evaluation in the paper by Maruki et al [<xref ref-type="bibr" rid="ref39">39</xref>] exhibited slightly more false negatives than that in the paper by Takeshima et al [<xref ref-type="bibr" rid="ref38">38</xref>]. Although the reasons for the judgment discrepancies were not investigated in this study’s data set, they may be attributed to the larger volume of records screened [<xref ref-type="bibr" rid="ref14">14</xref>] and the potentially more complex and challenging research question in the paper by Maruki et al [<xref ref-type="bibr" rid="ref39">39</xref>]. Using 2 reviewers to screen records can significantly lower the likelihood of false negatives [<xref ref-type="bibr" rid="ref16">16</xref>] and has been recommended [<xref ref-type="bibr" rid="ref11">11</xref>,<xref ref-type="bibr" rid="ref13">13</xref>]. Yet, there are cases, albeit rare, in which systematic review screenings are conducted by a single reviewer because of time constraints [<xref ref-type="bibr" rid="ref13">13</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]. 
Hence, the unavoidable errors and substantial time and effort required for screening represent significant drawbacks of human screening in systematic reviews [<xref ref-type="bibr" rid="ref10">10</xref>,<xref ref-type="bibr" rid="ref13">13</xref>].</p>
        <p>Conversely, methods using LLMs also present several drawbacks. One primary concern is their susceptibility to misinformation and quality issues inherent in their training data [<xref ref-type="bibr" rid="ref43">43</xref>]. Notably, in this study, the specificity of the GPT-3.5 screenings in the paper by Maruki et al [<xref ref-type="bibr" rid="ref39">39</xref>] was markedly low. While the causes are not definitive, this may be attributed to an insufficient understanding of bipolar disorder, MSs, and second-generation antipsychotics. Tran and colleagues [<xref ref-type="bibr" rid="ref34">34</xref>] incorporated relevant knowledge into their manually created prompts; this might have enhanced sensitivity but not specificity, which could also be due to GPT-3.5’s knowledge limitations. Furthermore, the decision-making processes of LLMs lack transparency, making them difficult to interpret [<xref ref-type="bibr" rid="ref43">43</xref>]. This lack of interpretability is compounded by the “grounding problem,” where LLMs struggle to grasp concrete facts and real-world scenarios due to their lack of real-world experiences and sensory input [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref44">44</xref>]. We attempted to verify incorrectly excluded records by querying GPT-3.5 and GPT-4 with the original screening prompts, their responses, and justifications. Our findings revealed that GPT-3.5’s lower accuracy was primarily due to a lack of knowledge about the target domain, while GPT-4’s incorrect exclusions were mainly due to misinterpretations of the inclusion criteria. These findings highlight the ongoing challenges in understanding and interpreting the decision-making processes of LLMs. Although GPT-4 demonstrates advancements in comprehension, factuality, specificity, and inference, it is still susceptible to factual errors [<xref ref-type="bibr" rid="ref45">45</xref>]. 
In addition, it has been suggested that LLMs’ accuracy diminishes with longer prompts [<xref ref-type="bibr" rid="ref46">46</xref>]; lengthy abstracts might have contributed to decreased accuracy in decision-making. A potential future risk is that the normalization of AI-based judgments could result in the oversight of human expert verification, potentially diminishing the quality of systematic reviews.</p>
        <p>On the positive side, compared with the human screening time reported in previous studies [<xref ref-type="bibr" rid="ref10">10</xref>], our method enabled remarkably faster screening. Although our approach uses a 3-layer structure, which might seem time-consuming at first glance, by limiting GPT-3.5/GPT-4 responses to “E” (Exclude) or “I” (Include), we efficiently screened a large volume of records in batch. Unlike humans, LLMs do not experience fatigue and subsequent decline in performance; moreover, they are presumed to have better reproducibility in their judgments. While using GPT-4’s API comes with associated costs [<xref ref-type="bibr" rid="ref36">36</xref>], the increased efficiency compared with human effort more than compensates for these expenses. Using LLMs for title and abstract screening could also enable screening a much larger number of records, previously deemed impractical due to time limitations. Our 3-layer method using GPT-4 exhibits high sensitivity and a useful level of specificity, yet opportunities for further refinement exist. Future studies could enhance accuracy through methods such as optimizing prompts [<xref ref-type="bibr" rid="ref47">47</xref>] and integrating multiple LLMs for decision assessment [<xref ref-type="bibr" rid="ref48">48</xref>], which may contribute to higher precision. In the meantime, LLM technology is set to continue advancing swiftly; future breakthroughs in LLMs may readily overcome our current challenges—possibly with only a simple prompt.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>This study has some limitations. First, the 2 systematic reviews used in this investigation [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>] were confined to clinical studies within psychiatry, limiting the generalizability of our findings. In addition, the sample size was small, and the investigation remained exploratory, with the results lacking statistical substantiation. Future studies should aim to replicate these findings across a broader range of medical fields and specialized domains to enhance their applicability and reliability. Second, the artificial intelligence industry is progressing rapidly, with information becoming obsolete within a matter of months or even weeks. The models we used in this study, gpt-3.5-turbo-0125 and gpt-4-0125-preview, are currently the most up-to-date. However, updates to these models might alter screening outcomes. Third, to ensure consistency in our findings, we set the temperature parameter to 0. However, a temperature of 0 does not always guarantee absolute uniformity in output sentences [<xref ref-type="bibr" rid="ref35">35</xref>]. Nevertheless, our observations indicate no variation in results across multiple tests with the same model in this study. Fourth, this study did not investigate the discrepancies in screening results between GPT-3.5 and GPT-4, nor did it examine the impact of prompt variations on performance. In addition, this research did not directly compare the performance of the proposed approach with existing systematic literature review strategies. Furthermore, this study was not designed to explore the risks associated with using LLMs for screening purposes. Finally, gpt-3.5-turbo-0125’s training data include information up to September 2021, whereas gpt-4-0125-preview’s training data extend to December 2023 [<xref ref-type="bibr" rid="ref35">35</xref>]. 
Consequently, the systematic review paper by Takeshima et al [<xref ref-type="bibr" rid="ref38">38</xref>] might have been incorporated into GPT-3.5’s training data set, with both systematic review papers possibly included in GPT-4’s data set. Nevertheless, as the study’s prompts did not explicitly reference these reviews, we consider that their impact is minimal.</p>
      </sec>
      <sec>
        <title>Conclusions</title>
        <p>We developed a practical screening method using GPT-3.5 and GPT-4 in the title- and abstract-screening process of systematic reviews. Our 3-layer method not only achieved better sensitivity for relevant records than previous machine learning–based screening methods [<xref ref-type="bibr" rid="ref20">20</xref>,<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref23">23</xref>,<xref ref-type="bibr" rid="ref29">29</xref>] but also demonstrated a remarkable potential to reduce human reviewers’ workload significantly. Although GPT-3.5 showed lower specificity, which may limit its applicability, the use of GPT-4 within our method yielded sensitivity comparable with human evaluators, making it suitable for use in systematic review screenings. Despite the focus on psychiatric fields and the small sample size of our study, our findings highlight the potential for broader application. We emphasize the importance of further validation across multiple domains to establish a universal screening methodology. Concurrently, developing more effective approaches in response to the advancing capabilities of LLMs is warranted in future research.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Script for the Google Spreadsheet.</p>
        <media xlink:href="jmir_v26i1e52758_app1.docx" xlink:title="DOCX File, 23 KB"/>
      </supplementary-material>
      <supplementary-material id="app2">
        <label>Multimedia Appendix 2</label>
        <p>Records eligible for full paper screening but excluded by GPT-3.5 or GPT-4.</p>
        <media xlink:href="jmir_v26i1e52758_app2.xlsx" xlink:title="XLSX File (Microsoft Excel File), 26 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">API</term>
          <def>
            <p>application programming interface</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">GPT</term>
          <def>
            <p>Generative Pre-trained Transformer</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">MS</term>
          <def>
            <p>mood stabilizers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">SGA</term>
          <def>
            <p>second-generation antipsychotics</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the Japan Society for the Promotion of Science (JSPS) KAKENHI (grant 22K15778). During the preparation of this work, the authors used ChatGPT (GPT-4 and GPT-4o, by OpenAI), Claude (Claude 3 Opus, by Anthropic), and Gemini (Gemini 1.5 Pro, by Google) to enhance the readability and proofread the English text. After using these services, the authors reviewed and edited the content as needed and took full responsibility for the content of the publication.</p>
    </ack>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brown</surname>
              <given-names>TB</given-names>
            </name>
            <name name-style="western">
              <surname>Mann</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Ryder</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Subbiah</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kaplan</surname>
              <given-names>JD</given-names>
            </name>
            <name name-style="western">
              <surname>Dhariwal</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Neelakantan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shyam</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Sastry</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Askell</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Herbert-Voss</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Krueger</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Henighan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Child</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Ramesh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ziegler</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Winter</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hesse</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sigler</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Litwin</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Gray</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Chess</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Clark</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Berner</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>McCandlish</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Radford</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Amodei</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Language models are few-shot learners</article-title>
          <source>Adv Neural Inf Process Syst</source>
          <year>2020</year>
          <volume>33</volume>
          <fpage>1877</fpage>
          <lpage>1901</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ouyang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Almeida</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Wainwright</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Mishkin</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Agarwal</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Slama</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Ray</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Schulman</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hilton</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kelton</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Miller</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Simens</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Askell</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Welinder</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Christiano</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Leike</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Lowe</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Training language models to follow instructions with human feedback</article-title>
          <source>Adv Neural Inf Process Syst</source>
          <year>2022</year>
          <volume>35</volume>
          <fpage>27730</fpage>
          <lpage>27744</lpage>
          <pub-id pub-id-type="doi">10.48550/arXiv.2203.02155</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="web">
          <source>Introducing ChatGPT</source>
          <access-date>2023-07-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/blog/chatgpt">https://openai.com/blog/chatgpt</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Levin</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Horesh</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Brezinov</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Meyer</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Performance of ChatGPT in medical examinations: a systematic review and a meta-analysis</article-title>
          <source>BJOG</source>
          <year>2024</year>
          <volume>131</volume>
          <issue>3</issue>
          <fpage>378</fpage>
          <lpage>380</lpage>
          <pub-id pub-id-type="doi">10.1111/1471-0528.17641</pub-id>
          <pub-id pub-id-type="medline">37604703</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="web">
          <source>GPT-4</source>
          <access-date>2024-02-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/research/gpt-4">https://openai.com/research/gpt-4</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bojic</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kovacevic</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Cabarkapa</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>GPT-4 surpassing human performance in linguistic pragmatics</article-title>
          <source>arXiv. Preprint posted online</source>
          <year>2023</year>
          <month>12</month>
          <day>15</day>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eriksen</surname>
              <given-names>AV</given-names>
            </name>
            <name name-style="western">
              <surname>Möller</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Ryg</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Use of GPT-4 to diagnose complex clinical cases</article-title>
          <source>NEJM AI</source>
          <year>2023</year>
          <volume>1</volume>
          <issue>1</issue>
          <fpage>AIp2300031</fpage>
          <pub-id pub-id-type="doi">10.1056/aip2300031</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>SG</given-names>
            </name>
          </person-group>
          <article-title>Using ChatGPT for language editing in scientific articles</article-title>
          <source>Maxillofac Plast Reconstr Surg</source>
          <year>2023</year>
          <volume>45</volume>
          <issue>1</issue>
          <fpage>13</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36882591"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s40902-023-00381-x</pub-id>
          <pub-id pub-id-type="medline">36882591</pub-id>
          <pub-id pub-id-type="pii">10.1186/s40902-023-00381-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC9992464</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Matsui</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Koda</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Yoshida</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Implications of nonhuman "Authors"</article-title>
          <source>JAMA</source>
          <year>2023</year>
          <volume>330</volume>
          <issue>6</issue>
          <fpage>566</fpage>
          <pub-id pub-id-type="doi">10.1001/jama.2023.10568</pub-id>
          <pub-id pub-id-type="medline">37552501</pub-id>
          <pub-id pub-id-type="pii">2807994</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lefebvre</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Glanville</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Briscoe</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Littlewood</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Marshall</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Metzendorf</surname>
              <given-names>MI</given-names>
            </name>
            <name name-style="western">
              <surname>Noel-Storr</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rader</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Shokraneh</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wieland</surname>
              <given-names>LS</given-names>
            </name>
          </person-group>
          <article-title>Searching for and selecting studies</article-title>
          <source>Cochrane Handbook for Systematic Reviews of Interventions</source>
          <year>2019</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>John Wiley &#38; Sons</publisher-name>
          <fpage>67</fpage>
          <lpage>107</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Higgins</surname>
              <given-names>JPT</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chandler</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Cumpston</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Page</surname>
              <given-names>MJ</given-names>
            </name>
          </person-group>
          <source>Cochrane Handbook for Systematic Reviews of Interventions</source>
          <year>2019</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>John Wiley &#38; Sons</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Polanin</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Pigott</surname>
              <given-names>TD</given-names>
            </name>
            <name name-style="western">
              <surname>Espelage</surname>
              <given-names>DL</given-names>
            </name>
            <name name-style="western">
              <surname>Grotpeter</surname>
              <given-names>JK</given-names>
            </name>
          </person-group>
          <article-title>Best practice guidelines for abstract screening large‐evidence systematic reviews and meta‐analyses</article-title>
          <source>Res Synth Methods</source>
          <year>2019</year>
          <volume>10</volume>
          <issue>3</issue>
          <fpage>330</fpage>
          <lpage>342</lpage>
          <pub-id pub-id-type="doi">10.1002/jrsm.1354</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Page</surname>
              <given-names>MJ</given-names>
            </name>
            <name name-style="western">
              <surname>Moher</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bossuyt</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Boutron</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Hoffmann</surname>
              <given-names>TC</given-names>
            </name>
            <name name-style="western">
              <surname>Mulrow</surname>
              <given-names>CD</given-names>
            </name>
            <name name-style="western">
              <surname>Shamseer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Tetzlaff</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Akl</surname>
              <given-names>EA</given-names>
            </name>
            <name name-style="western">
              <surname>Brennan</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Chou</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Glanville</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Grimshaw</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Hróbjartsson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lalu</surname>
              <given-names>MM</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Loder</surname>
              <given-names>EW</given-names>
            </name>
            <name name-style="western">
              <surname>Mayo-Wilson</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>McDonald</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>McGuinness</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Stewart</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Tricco</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Welch</surname>
              <given-names>VA</given-names>
            </name>
            <name name-style="western">
              <surname>Whiting</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>McKenzie</surname>
              <given-names>JE</given-names>
            </name>
          </person-group>
          <article-title>PRISMA 2020 explanation and elaboration: updated guidance and exemplars for reporting systematic reviews</article-title>
          <source>BMJ</source>
          <year>2021</year>
          <volume>372</volume>
          <fpage>n160</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.bmj.com/lookup/pmidlookup?view=long&#38;pmid=33781993"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmj.n160</pub-id>
          <pub-id pub-id-type="medline">33781993</pub-id>
          <pub-id pub-id-type="pmcid">PMC8005925</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>O'Hearn</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>MacDonald</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Tsampalieros</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kadota</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Sandarage</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Jayawarden</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Datko</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Reynolds</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Bui</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sultan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sampson</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pratt</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Barrowman</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Nama</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Page</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>McNally</surname>
              <given-names>JD</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the relationship between citation set size, team size and screening methods used in systematic reviews: a cross-sectional study</article-title>
          <source>BMC Med Res Methodol</source>
          <year>2021</year>
          <volume>21</volume>
          <issue>1</issue>
          <fpage>142</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-021-01335-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12874-021-01335-5</pub-id>
          <pub-id pub-id-type="medline">34238247</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12874-021-01335-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC8264476</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Nayfeh</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Tetzlaff</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>O'Blenis</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Murad</surname>
              <given-names>MH</given-names>
            </name>
          </person-group>
          <article-title>Error rates of human reviewers during abstract screening in systematic reviews</article-title>
          <source>PLoS One</source>
          <year>2020</year>
          <volume>15</volume>
          <issue>1</issue>
          <fpage>e0227742</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0227742"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0227742</pub-id>
          <pub-id pub-id-type="medline">31935267</pub-id>
          <pub-id pub-id-type="pii">PONE-D-19-26633</pub-id>
          <pub-id pub-id-type="pmcid">PMC6959565</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gartlehner</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Affengruber</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Titscher</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Noel-Storr</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dooley</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Ballarini</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>König</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Single-reviewer abstract screening missed 13 percent of relevant studies: a crowd-based, randomized controlled trial</article-title>
          <source>J Clin Epidemiol</source>
          <year>2020</year>
          <volume>121</volume>
          <fpage>20</fpage>
          <lpage>28</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S0895-4356(19)30982-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jclinepi.2020.01.005</pub-id>
          <pub-id pub-id-type="medline">31972274</pub-id>
          <pub-id pub-id-type="pii">S0895-4356(19)30982-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wilson</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Cruz</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Maclean</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ghanawi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>McCann</surname>
              <given-names>SK</given-names>
            </name>
            <name name-style="western">
              <surname>Brennan</surname>
              <given-names>PM</given-names>
            </name>
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sena</surname>
              <given-names>ES</given-names>
            </name>
            <name name-style="western">
              <surname>Macleod</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Screening for in vitro systematic reviews: a comparison of screening methods and training of a machine learning classifier</article-title>
          <source>Clin Sci (Lond)</source>
          <year>2023</year>
          <volume>137</volume>
          <issue>2</issue>
          <fpage>181</fpage>
          <lpage>193</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36630537"/>
          </comment>
          <pub-id pub-id-type="doi">10.1042/CS20220594</pub-id>
          <pub-id pub-id-type="medline">36630537</pub-id>
          <pub-id pub-id-type="pii">232436</pub-id>
          <pub-id pub-id-type="pmcid">PMC9885807</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bannach-Brown</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Przybyła</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rice</surname>
              <given-names>ASC</given-names>
            </name>
            <name name-style="western">
              <surname>Ananiadou</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liao</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Macleod</surname>
              <given-names>MR</given-names>
            </name>
          </person-group>
          <article-title>Machine learning algorithms for systematic review: reducing workload in a preclinical review of animal studies and reducing human screening error</article-title>
          <source>Syst Rev</source>
          <year>2019</year>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>23</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://systematicreviewsjournal.biomedcentral.com/articles/10.1186/s13643-019-0942-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13643-019-0942-7</pub-id>
          <pub-id pub-id-type="medline">30646959</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13643-019-0942-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC6334440</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cierco Jimenez</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Rosillo</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Cordova</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Cree</surname>
              <given-names>IA</given-names>
            </name>
            <name name-style="western">
              <surname>Gonzalez</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Indave Ruiz</surname>
              <given-names>BI</given-names>
            </name>
          </person-group>
          <article-title>Machine learning computational tools to assist the performance of systematic reviews: a mapping review</article-title>
          <source>BMC Med Res Methodol</source>
          <year>2022</year>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>322</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-022-01805-4"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12874-022-01805-4</pub-id>
          <pub-id pub-id-type="medline">36522637</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12874-022-01805-4</pub-id>
          <pub-id pub-id-type="pmcid">PMC9756658</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shemilt</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Simon</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hollands</surname>
              <given-names>GJ</given-names>
            </name>
            <name name-style="western">
              <surname>Marteau</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Ogilvie</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>O'Mara-Eves</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>MP</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Pinpointing needles in giant haystacks: use of text mining to reduce impractical screening workload in extremely large scoping reviews</article-title>
          <source>Res Synth Methods</source>
          <year>2014</year>
          <volume>5</volume>
          <issue>1</issue>
          <fpage>31</fpage>
          <lpage>49</lpage>
          <pub-id pub-id-type="doi">10.1002/jrsm.1093</pub-id>
          <pub-id pub-id-type="medline">26054024</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rathbone</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Hoffmann</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Glasziou</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Faster title and abstract screening? Evaluating Abstrackr, a semi-automated online screening program for systematic reviewers</article-title>
          <source>Syst Rev</source>
          <year>2015</year>
          <volume>4</volume>
          <fpage>80</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://systematicreviewsjournal.biomedcentral.com/articles/10.1186/s13643-015-0067-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13643-015-0067-6</pub-id>
          <pub-id pub-id-type="medline">26073974</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13643-015-0067-6</pub-id>
          <pub-id pub-id-type="pmcid">PMC4472176</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Olofsson</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Brolund</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hellberg</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Silverstein</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Stenström</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Österberg</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Dagerhamn</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Can abstract screening workload be reduced using text mining? User experiences of the tool Rayyan</article-title>
          <source>Res Synth Methods</source>
          <year>2017</year>
          <volume>8</volume>
          <issue>3</issue>
          <fpage>275</fpage>
          <lpage>280</lpage>
          <pub-id pub-id-type="doi">10.1002/jrsm.1237</pub-id>
          <pub-id pub-id-type="medline">28374510</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gates</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hartling</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Technology-assisted title and abstract screening for systematic reviews: a retrospective evaluation of the Abstrackr machine learning tool</article-title>
          <source>Syst Rev</source>
          <year>2018</year>
          <volume>7</volume>
          <issue>1</issue>
          <fpage>45</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://systematicreviewsjournal.biomedcentral.com/articles/10.1186/s13643-018-0707-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13643-018-0707-8</pub-id>
          <pub-id pub-id-type="medline">29530097</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13643-018-0707-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC5848519</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gartlehner</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Wagner</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Lux</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Affengruber</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Dobrescu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kaminski-Hartenthaler</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Viswanathan</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Assessing the accuracy of machine-assisted abstract screening with DistillerAI: a user study</article-title>
          <source>Syst Rev</source>
          <year>2019</year>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>277</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://systematicreviewsjournal.biomedcentral.com/articles/10.1186/s13643-019-1221-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13643-019-1221-3</pub-id>
          <pub-id pub-id-type="medline">31727159</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13643-019-1221-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC6857277</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gates</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Gates</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sebastianski</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Guitard</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Elliott</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Hartling</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>The semi-automation of title and abstract screening: a retrospective exploration of ways to leverage Abstrackr's relevance predictions in systematic and rapid reviews</article-title>
          <source>BMC Med Res Methodol</source>
          <year>2020</year>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>139</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-020-01031-w"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12874-020-01031-w</pub-id>
          <pub-id pub-id-type="medline">32493228</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12874-020-01031-w</pub-id>
          <pub-id pub-id-type="pmcid">PMC7268596</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hamel</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kelly</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Thavorn</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Rice</surname>
              <given-names>DB</given-names>
            </name>
            <name name-style="western">
              <surname>Wells</surname>
              <given-names>GA</given-names>
            </name>
            <name name-style="western">
              <surname>Hutton</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>An evaluation of DistillerSR's machine learning-based prioritization tool for title/abstract screening—impact on reviewer-relevant outcomes</article-title>
          <source>BMC Med Res Methodol</source>
          <year>2020</year>
          <volume>20</volume>
          <issue>1</issue>
          <fpage>256</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-020-01129-1"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12874-020-01129-1</pub-id>
          <pub-id pub-id-type="medline">33059590</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12874-020-01129-1</pub-id>
          <pub-id pub-id-type="pmcid">PMC7559198</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Reddy</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Patel</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Weyrich</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fenton</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Viswanathan</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Comparison of a traditional systematic review approach with review-of-reviews and semi-automation as strategies to update the evidence</article-title>
          <source>Syst Rev</source>
          <year>2020</year>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>243</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://systematicreviewsjournal.biomedcentral.com/articles/10.1186/s13643-020-01450-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13643-020-01450-2</pub-id>
          <pub-id pub-id-type="medline">33076975</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13643-020-01450-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC7574591</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pham</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Jovanovic</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bagheri</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Antony</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ashoor</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Nguyen</surname>
              <given-names>TT</given-names>
            </name>
            <name name-style="western">
              <surname>Rios</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Robson</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>SM</given-names>
            </name>
            <name name-style="western">
              <surname>Watt</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Straus</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Tricco</surname>
              <given-names>AC</given-names>
            </name>
          </person-group>
          <article-title>Text mining to support abstract screening for knowledge syntheses: a semi-automated workflow</article-title>
          <source>Syst Rev</source>
          <year>2021</year>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>156</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://systematicreviewsjournal.biomedcentral.com/articles/10.1186/s13643-021-01700-x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13643-021-01700-x</pub-id>
          <pub-id pub-id-type="medline">34039433</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13643-021-01700-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC8152711</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Valizadeh</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Moassefi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nakhostin-Ansari</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hosseini Asl</surname>
              <given-names>SH</given-names>
            </name>
            <name name-style="western">
              <surname>Saghab Torbati</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aghajani</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Maleki Ghorbani</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Faghani</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Abstract screening using the automated tool Rayyan: results of effectiveness in three diagnostic test accuracy systematic reviews</article-title>
          <source>BMC Med Res Methodol</source>
          <year>2022</year>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>160</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedresmethodol.biomedcentral.com/articles/10.1186/s12874-022-01631-8"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12874-022-01631-8</pub-id>
          <pub-id pub-id-type="medline">35655155</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12874-022-01631-8</pub-id>
          <pub-id pub-id-type="pmcid">PMC9161508</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>O'Connor</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Tsafnat</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Glasziou</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Gilbert</surname>
              <given-names>SB</given-names>
            </name>
            <name name-style="western">
              <surname>Hutton</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>A question of trust: can we build an evidence base to gain trust in systematic review automation technologies?</article-title>
          <source>Syst Rev</source>
          <year>2019</year>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>143</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://systematicreviewsjournal.biomedcentral.com/articles/10.1186/s13643-019-1062-0"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13643-019-1062-0</pub-id>
          <pub-id pub-id-type="medline">31215463</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13643-019-1062-0</pub-id>
          <pub-id pub-id-type="pmcid">PMC6582554</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kohandel Gargari</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Mahmoudi</surname>
              <given-names>MH</given-names>
            </name>
            <name name-style="western">
              <surname>Hajisafarali</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Samiee</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Enhancing title and abstract screening for systematic reviews with GPT-3.5 turbo</article-title>
          <source>BMJ Evid Based Med</source>
          <year>2024</year>
          <volume>29</volume>
          <issue>1</issue>
          <fpage>69</fpage>
          <lpage>70</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://ebm.bmj.com/lookup/pmidlookup?view=long&#38;pmid=37989538"/>
          </comment>
          <pub-id pub-id-type="doi">10.1136/bmjebm-2023-112678</pub-id>
          <pub-id pub-id-type="medline">37989538</pub-id>
          <pub-id pub-id-type="pii">bmjebm-2023-112678</pub-id>
          <pub-id pub-id-type="pmcid">PMC10850650</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Khraisha</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Put</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kappenberg</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Warraitch</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hadfield</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Can large language models replace humans in systematic reviews? Evaluating GPT-4's efficacy in screening and extracting data from peer-reviewed and grey literature in multiple languages</article-title>
          <source>Res Synth Methods</source>
          <year>2024</year>
          <volume>15</volume>
          <issue>4</issue>
          <fpage>616</fpage>
          <lpage>626</lpage>
          <pub-id pub-id-type="doi">10.1002/jrsm.1715</pub-id>
          <pub-id pub-id-type="medline">38484744</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Guo</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Deng</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Paget</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Naugler</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Automated paper screening for clinical reviews using large language models: data analysis study</article-title>
          <source>J Med Internet Res</source>
          <year>2024</year>
          <volume>26</volume>
          <fpage>e48996</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jmir.org/2024/1/e48996/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/48996</pub-id>
          <pub-id pub-id-type="medline">38214966</pub-id>
          <pub-id pub-id-type="pii">v26i1e48996</pub-id>
          <pub-id pub-id-type="pmcid">PMC10818236</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tran</surname>
              <given-names>VT</given-names>
            </name>
            <name name-style="western">
              <surname>Gartlehner</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Yaacoub</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Boutron</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Schwingshackl</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Stadelmaier</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sommer</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Alebouyeh</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Afach</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Meerpohl</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Ravaud</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Sensitivity and specificity of using GPT-3.5 turbo models for title and abstract screening in systematic reviews and meta-analyses</article-title>
          <source>Ann Intern Med</source>
          <year>2024</year>
          <volume>177</volume>
          <issue>6</issue>
          <fpage>791</fpage>
          <lpage>799</lpage>
          <pub-id pub-id-type="doi">10.7326/M23-3389</pub-id>
          <pub-id pub-id-type="medline">38768452</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="web">
          <source>Models</source>
          <access-date>2024-02-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://platform.openai.com/docs/models">https://platform.openai.com/docs/models</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="web">
          <source>Pricing</source>
          <access-date>2024-03-06</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/pricing">https://openai.com/pricing</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="web">
          <source>API Reference</source>
          <access-date>2024-02-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://platform.openai.com/docs/api-reference/">https://platform.openai.com/docs/api-reference/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Takeshima</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Utsumi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Aoki</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Suzuki</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Okajima</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Watanabe</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Watanabe</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Takaesu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Efficacy and safety of bright light therapy for manic and depressive symptoms in patients with bipolar disorder: a systematic review and meta-analysis</article-title>
          <source>Psychiatry Clin Neurosci</source>
          <year>2020</year>
          <volume>74</volume>
          <issue>4</issue>
          <fpage>247</fpage>
          <lpage>256</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31917880"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/pcn.12976</pub-id>
          <pub-id pub-id-type="medline">31917880</pub-id>
          <pub-id pub-id-type="pmcid">PMC7187384</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maruki</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Utsumi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Takeshima</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Fujiwara</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Matsui</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Aoki</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Toda</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Watanabe</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Watanabe</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Takaesu</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Efficacy and safety of adjunctive therapy to lamotrigine, lithium, or valproate monotherapy in bipolar depression: a systematic review and meta-analysis of randomized controlled trials</article-title>
          <source>Int J Bipolar Disord</source>
          <year>2022</year>
          <volume>10</volume>
          <issue>1</issue>
          <fpage>24</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36269465"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s40345-022-00271-7</pub-id>
          <pub-id pub-id-type="medline">36269465</pub-id>
          <pub-id pub-id-type="pii">10.1186/s40345-022-00271-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC9587199</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Benchimol</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kazinnik</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Saadon</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Text mining methodologies with R: an application to central bank texts</article-title>
          <source>Mach Learn Appl</source>
          <year>2022</year>
          <volume>8</volume>
          <fpage>100286</fpage>
          <pub-id pub-id-type="doi">10.1016/j.mlwa.2022.100286</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>AN</given-names>
            </name>
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Polosukhin</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Attention is all you need</article-title>
          <source>Adv Neural Inf Process Syst</source>
          <year>2017</year>
          <volume>30</volume>
          <fpage>5998</fpage>
          <lpage>6008</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nussbaumer-Streit</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Mayr</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Dobrescu</surname>
              <given-names>AI</given-names>
            </name>
            <name name-style="western">
              <surname>Chapman</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Persad</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Klerings</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Wagner</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Siebert</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Christof</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Zachariah</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Gartlehner</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Quarantine alone or in combination with other public health measures to control COVID-19: a rapid review</article-title>
          <source>Cochrane Database Syst Rev</source>
          <year>2020</year>
          <volume>4</volume>
          <issue>4</issue>
          <fpage>CD013574</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32267544"/>
          </comment>
          <pub-id pub-id-type="doi">10.1002/14651858.CD013574</pub-id>
          <pub-id pub-id-type="medline">32267544</pub-id>
          <pub-id pub-id-type="pmcid">PMC7141753</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Pei</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tiwari</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Pre-trained language models in biomedical domain: a systematic survey</article-title>
          <source>ACM Comput Surv</source>
          <year>2023</year>
          <volume>56</volume>
          <issue>3</issue>
          <fpage>1</fpage>
          <lpage>52</lpage>
          <pub-id pub-id-type="doi">10.1145/3611651</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Mollo</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Millière</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>The vector grounding problem</article-title>
          <source>arXiv. Preprint posted online</source>
          <year>2023</year>
          <month>04</month>
          <day>04</day>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>KC-C</given-names>
            </name>
          </person-group>
          <article-title>Why does ChatGPT fall short in providing truthful answers?</article-title>
          <source>arXiv. Preprint posted online</source>
          <year>2023</year>
          <month>12</month>
          <day>03</day>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Levy</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Jacoby</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Goldberg</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Same task, more tokens: the impact of input length on the reasoning performance of large language models</article-title>
          <source>arXiv. Preprint posted online</source>
          <year>2024</year>
          <month>07</month>
          <day>10</day>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Giray</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Prompt engineering with ChatGPT: a guide for academic writers</article-title>
          <source>Ann Biomed Eng</source>
          <year>2023</year>
          <volume>51</volume>
          <issue>12</issue>
          <fpage>2629</fpage>
          <lpage>2633</lpage>
          <pub-id pub-id-type="doi">10.1007/s10439-023-03272-4</pub-id>
          <pub-id pub-id-type="medline">37284994</pub-id>
          <pub-id pub-id-type="pii">10.1007/s10439-023-03272-4</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Ye</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>More agents is all you need</article-title>
          <source>arXiv. Preprint posted online</source>
          <year>2024</year>
          <month>02</month>
          <day>03</day>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
