<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article article-type="research-article" dtd-version="2.0" xmlns:xlink="http://www.w3.org/1999/xlink">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v26i1e52935</article-id>
      <article-id pub-id-type="pmid">38578685</article-id>
      <article-id pub-id-type="doi">10.2196/52935</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Evaluation of Large Language Model Performance and Reliability for Citations and References in Scholarly Writing: Cross-Disciplinary Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Mavragani</surname>
            <given-names>Amaryllis</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Bu</surname>
            <given-names>Yi</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Li</surname>
            <given-names>Wenhao</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Liu</surname>
            <given-names>Ivan</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Mihalache</surname>
            <given-names>Andrew</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Mugaanyi</surname>
            <given-names>Joseph</given-names>
          </name>
          <degrees>MBBS, MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-1688-5475</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author" equal-contrib="yes">
          <name name-style="western">
            <surname>Cai</surname>
            <given-names>Liuying</given-names>
          </name>
          <degrees>MPhil</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0005-2648-1839</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Cheng</surname>
            <given-names>Sumei</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0009-0000-3638-4171</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author">
          <name name-style="western">
            <surname>Lu</surname>
            <given-names>Caide</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9588-2218</ext-link>
        </contrib>
        <contrib id="contrib5" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>Huang</surname>
            <given-names>Jing</given-names>
          </name>
          <degrees>MD, PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <address>
            <institution>Department of Hepato-Pancreato-Biliary Surgery, Ningbo Medical Center Lihuili Hospital</institution>
            <institution>Health Science Center</institution>
            <institution>Ningbo University</institution>
            <addr-line>No 1111 Jiangnan Road</addr-line>
            <addr-line>Ningbo, 315000</addr-line>
            <country>China</country>
            <phone>86 13819803591</phone>
            <email>huangjingonline@163.com</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3245-3605</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>Department of Hepato-Pancreato-Biliary Surgery, Ningbo Medical Center Lihuili Hospital</institution>
        <institution>Health Science Center</institution>
        <institution>Ningbo University</institution>
        <addr-line>Ningbo</addr-line>
        <country>China</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>Institute of Philosophy</institution>
        <institution>Shanghai Academy of Social Sciences</institution>
        <addr-line>Shanghai</addr-line>
        <country>China</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Jing Huang <email>huangjingonline@163.com</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2024</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>5</day>
        <month>4</month>
        <year>2024</year>
      </pub-date>
      <volume>26</volume>
      <elocation-id>e52935</elocation-id>
      <history>
        <date date-type="received">
          <day>19</day>
          <month>9</month>
          <year>2023</year>
        </date>
        <date date-type="rev-request">
          <day>8</day>
          <month>12</month>
          <year>2023</year>
        </date>
        <date date-type="rev-recd">
          <day>14</day>
          <month>12</month>
          <year>2023</year>
        </date>
        <date date-type="accepted">
          <day>12</day>
          <month>3</month>
          <year>2024</year>
        </date>
      </history>
      <copyright-statement>©Joseph Mugaanyi, Liuying Cai, Sumei Cheng, Caide Lu, Jing Huang. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 05.04.2024.</copyright-statement>
      <copyright-year>2024</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research, is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2024/1/e52935" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Large language models (LLMs) have gained prominence since the release of ChatGPT in late 2022.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>The aim of this study was to assess the accuracy of citations and references generated by ChatGPT (GPT-3.5) in two distinct academic domains: the natural sciences and humanities.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>Two researchers independently prompted ChatGPT to write an introduction section for a manuscript and include citations; they then evaluated the accuracy of the citations and Digital Object Identifiers (DOIs). Results were compared between the two disciplines.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>Ten topics were included: 5 in the natural sciences and 5 in the humanities. A total of 102 citations were generated, with 55 in the natural sciences and 47 in the humanities. Among these, 40 citations (72.7%) in the natural sciences and 36 citations (76.6%) in the humanities were confirmed to exist (<italic>P</italic>=.42). There were significant disparities found in DOI presence in the natural sciences (39/55, 70.9%) and the humanities (18/47, 38.3%), along with significant differences in accuracy between the two disciplines (18/55, 32.7% vs 4/47, 8.5%). DOI hallucination was more prevalent in the humanities (42/47, 89.4%) than in the natural sciences (34/55, 61.8%). The mean Levenshtein distance was significantly higher in the natural sciences group (64.13, SD 42.26) than in the humanities group (42.15, SD 40.23).</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>ChatGPT’s performance in generating citations and references varies across disciplines. Differences in DOI standards and disciplinary nuances contribute to performance variations. Researchers should consider the strengths and limitations of artificial intelligence writing tools with respect to citation accuracy. The use of domain-specific models may enhance accuracy.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>large language models</kwd>
        <kwd>accuracy</kwd>
        <kwd>academic writing</kwd>
        <kwd>AI</kwd>
        <kwd>cross-disciplinary evaluation</kwd>
        <kwd>scholarly writing</kwd>
        <kwd>ChatGPT</kwd>
        <kwd>GPT-3.5</kwd>
        <kwd>writing tool</kwd>
        <kwd>scholarly</kwd>
        <kwd>academic discourse</kwd>
        <kwd>LLMs</kwd>
        <kwd>machine learning algorithms</kwd>
        <kwd>NLP</kwd>
        <kwd>natural language processing</kwd>
        <kwd>citations</kwd>
        <kwd>references</kwd>
        <kwd>natural science</kwd>
        <kwd>humanities</kwd>
        <kwd>chatbot</kwd>
        <kwd>artificial intelligence</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <sec>
        <title>Background</title>
        <p>In the ever-evolving landscape of scholarly research and academic discourse, the role of technology in aiding and enhancing the research process has grown exponentially. One of the most notable advancements in this regard is the emergence of large language models (LLMs) such as GPT-3.5, which have demonstrated impressive capabilities in generating written content across various domains, including academic writing. These LLMs, powered by vast corpora of text data and sophisticated machine-learning algorithms, have offered researchers and writers a new tool for assistance in crafting scholarly documents [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref3">3</xref>]. LLMs were initially designed and developed to primarily assist in natural language writing. However, since the release of ChatGPT in late 2022, the tool has been adopted in a wide range of scenarios, including customer care, expert systems, as well as literature searches and academic writing. Researchers have already used LLMs to write their academic papers, as demonstrated by Kishony and Ifargan [<xref ref-type="bibr" rid="ref4">4</xref>]. While the potential of these tools is evident, it is essential to critically assess their performance, especially in the intricate domains of citations and references, which are the foundation of academic discourse and credibility.</p>
        <p>Citations and references serve as the backbone of scholarly communication, providing the necessary context, evidence, and credit to prior works, thus fostering intellectual dialogue and ensuring the integrity of the research process. Accuracy in generating citations and the inclusion of Digital Object Identifiers (DOIs) [<xref ref-type="bibr" rid="ref5">5</xref>] are paramount, as they directly influence the traceability and accessibility of cited works. Despite the promise of LLMs, concerns have emerged regarding the reliability and precision of their generated citations and references, raising questions about their suitability as academic writing assistants. Studies on the viability of LLMs as writing assistants in scholarly writing [<xref ref-type="bibr" rid="ref6">6</xref>-<xref ref-type="bibr" rid="ref8">8</xref>] underscore the significance of this body of research within the broader academic landscape. Although prior works are quite informative [<xref ref-type="bibr" rid="ref9">9</xref>-<xref ref-type="bibr" rid="ref12">12</xref>], there is a lack of an interdisciplinary perspective on citations and references generated by LLMs, which is vital for understanding how LLMs perform across different disciplines.</p>
        <p>An increasing number of academics and researchers, especially in countries where English is not a first language (eg, China), are relying on ChatGPT to translate their work into English, research the existing published literature, and even generate citations and references to published literature. Therefore, the aim of this study was to evaluate LLM performance in generating citations and references across two distinct domains, the natural sciences and humanities, by assessing both the presence and accuracy of citations, the existence and accuracy of DOIs, and the potential for hallucination. We aim to provide valuable insights into the strengths and limitations of LLMs in supporting academic writing in diverse research contexts.</p>
        <p>The outcomes of this study will contribute to a nuanced understanding of the capabilities and limitations of LLMs as academic writing assistants. Moreover, our findings may inform best practices for researchers and writers who employ these tools in their work, fostering transparency and accuracy in scholarly communication.</p>
      </sec>
      <sec>
        <title>LLM Concepts</title>
        <p>An LLM is a catch-all term for a machine-learning model designed and trained to understand and generate natural language. LLMs are considered “large” language models due to the sheer number of parameters in the model. A parameter in machine learning is a numerical variable or weight that is optimized through training to map a relationship between the input and the output. LLMs have millions to billions of parameters.</p>
        <p>Current LLMs are mostly based on the transformer architecture (<xref rid="figure1" ref-type="fig">Figure 1</xref>). However, before transformers were introduced in 2017 [<xref ref-type="bibr" rid="ref13">13</xref>], recurrent neural nets (RNNs) were mostly used for natural language processing. One key limitation of RNNs was the length of text they could handle. In 2015, Bahdanau et al [<xref ref-type="bibr" rid="ref14">14</xref>] proposed accounting for attention to improve RNN performance with long text. Drawing inspiration from the RNN’s encoder-decoder design, the transformer consists of an encoder and a decoder; however, unlike the RNN, the transformer does not perform sequential data processing and each layer can address all other layers. This allows the transformer model to handle different parts of the input as it processes each part at different stages. This is the mechanism that allows for self-attention in the transformer model.</p>
        <p>The way attention works in a transformer model is by computing attention weights for each token, and then the relevance of the token is determined based on the weights. This allows the model to track and assign hierarchical values to each token. Fundamentally, this is similar to how humans process language by extracting the key details out of a chunk of text. This architecture is the linchpin for the majority of LLMs, including the GPT model [<xref ref-type="bibr" rid="ref15">15</xref>] that is the basis of OpenAI’s ChatGPT or the bidirectional encoder representations from transformers (BERT) algorithm [<xref ref-type="bibr" rid="ref16">16</xref>]. These are broadly categorized into encoder-style and decoder-style transformers, with the former mostly applying to predictive tasks and the latter applying to generative tasks.</p>
        <p>Irrespective of the architecture, as an encoder-style or decoder-style transformer, the model is trained on a vast volume of data. The objective is to train a model capable of applying the knowledge gained from the training data to unseen data or situations. This is referred to as generalization. If the model is capable of precise recall of data it has previously been exposed to, this would be memorization and overfitting is said to have occurred. However, this does not mean that memorization is in itself a negative feature. Indeed, there are situations where memorization is preferable to generation such as in the task of information cataloging.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Transformer model architecture (left) and GPT architecture (right).</p>
          </caption>
          <graphic xlink:href="jmir_v26i1e52935_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>LLMs in Academia</title>
        <p>LLMs can handle tasks such as text classification, translation, summarization, and text generation. Since the advent of the internet, and with it the publication of scientific information online, the amount of global academic output exploded, with more than 5 million articles published in 2022 (<xref ref-type="table" rid="table1">Table 1</xref>). Given the pressure in academia to keep up with developments in one’s field, it is increasingly becoming more difficult to track, prioritize, and keep up with scientific information. It is against this backdrop that LLMs offer an opportunity. Perhaps the most obvious use case is in literature reviews and summarization, reference lookup, and data generation.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Number of academic papers published per year, 2018-2022.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="500"/>
            <col width="500"/>
            <thead>
              <tr valign="top">
                <td>Year</td>
                <td>Number of articles published (millions)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>2018</td>
                <td>4.18</td>
              </tr>
              <tr valign="top">
                <td>2019</td>
                <td>4.43</td>
              </tr>
              <tr valign="top">
                <td>2020</td>
                <td>4.68</td>
              </tr>
              <tr valign="top">
                <td>2021</td>
                <td>5.03</td>
              </tr>
              <tr valign="top">
                <td>2022</td>
                <td>5.14</td>
              </tr>
            </tbody>
          </table>
        </table-wrap>
        <p>However, there are still several questions that need to be answered. First, machine-learning models are inherently probabilistic, meaning that they are not deterministic. Therefore, for the same user input, the model may give different results due to the variability baked into the model. While this can be a valuable trait for creative endeavors, in academic and scientific works, there is a need for reproducibility and reliability, and it remains unclear how well this can be achieved. Second, LLMs are constrained to the information they are trained on. This can be affected by selection bias, the quality of data used, artifacts resulting from data cleaning, and other factors. In essence, we rely on trusting the trainer to provide accurate and unbiased training data to the models.</p>
        <p>There is potential for LLMs to be useful tools for delivering academic and scientific information to various audiences, including—but not limited to—students and other academics. However, for this use case, a degree of memorization of the underlying content is necessary. Where information is unavailable, it would be better to state so rather than to interpolate. In the current iteration of LLMs, since the training is geared toward generalization and the models are probabilistic, they tend to interpolate and fill in the missing information with synthetic text. There is still a need to explore this process deeper to find solutions.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Data Collection and Validation</title>
        <p>Topics were selected and categorized as either natural sciences or humanities. Topics were included if they were: (1) clinical or biomedical–related research in the natural sciences category and philosophy/psychology-related research in the humanities category, and (2) published in English. Topics were excluded if they were: (1) not in English, (2) related to a highly specialized or niche field, and (3) sensitive or controversial in nature. Two researchers independently prompted ChatGPT (GPT-3.5) to write sections of a manuscript while adhering to the American Psychological Association style [<xref ref-type="bibr" rid="ref17">17</xref>] for citations and including the DOI of each reference. Citations and references generated by ChatGPT were collected for subsequent analysis. The researchers then independently validated the references by conducting searches on Google Scholar, PubMed, and Google Search for each cited reference. The primary objective was to confirm the existence and accuracy of the cited literature. DOI existence and validation were confirmed using the DOI Foundation website [<xref ref-type="bibr" rid="ref18">18</xref>]. DOIs that did not exist or were matched to a different source were considered hallucinations [<xref ref-type="bibr" rid="ref19">19</xref>]. Data collected by both researchers were aggregated and compared. Independent validation was performed to ensure agreement between the two researchers regarding the existence, validity, and accuracy of the citations and DOIs. Any disagreements or discrepancies were resolved through discussion and consensus.</p>
        <p>In this study, hallucination refers to instances where ChatGPT 3.5 generates DOIs and/or citations that do not correspond to actual, valid DOIs/citations for scholarly references. In these instances, the model may produce DOIs and/or citations that seem authentic but are in fact incorrect or nonexistent. The Levenshtein distance, also known as the edit distance, is a measure of the similarity between two strings by calculating the minimum number of single-character edits (insertions, deletions, or substitutions) required to transform one string into the other. In other words, this metric quantifies the “distance” between two strings in terms of the minimum number of operations needed to make them identical. We used the Levenshtein distance to compare the DOI generated by ChatGPT with the correct DOI. This comparison helps to measure how closely the artificial intelligence (AI)–generated DOI aligns with the expected DOI for a given citation. By calculating the Levenshtein distance, we can quantify the differences between the AI-generated DOI and the correct DOI. Larger Levenshtein distance values suggest greater dissimilarity, indicating potential inaccuracies in the AI-generated DOI.</p>
      </sec>
      <sec>
        <title>Statistical Analysis</title>
        <p>Data analysis was conducted using SPSS 26 and Python. The Levenshtein distance [<xref ref-type="bibr" rid="ref20">20</xref>] between the generated DOI and the actual DOI was calculated using the <italic>thefuzz</italic> package in Python to quantitatively assess the DOI accuracy. Continuous variables are reported as mean (SD) and categorical variables are presented as absolute numbers and percentages. An independent-sample <italic>t</italic> test was used to compare continuous variables, whereas the Fisher exact test was used for comparisons of categorical variables. A <italic>P</italic> value &lt;.05 was considered statistically significant in all tests.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This study was exempt from ethical review since no animal or human participants were involved.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Included Topics and Citations</title>
        <p>Ten manuscript topics were selected and included in the study, with 5 in the natural sciences group and 5 in the humanities group. ChatGPT 3.5 was prompted to write an introduction section for each topic between July 10 and August 15, 2023. A total of 102 citations were generated by ChatGPT. Of these, 55 were in the natural sciences group and 47 in the humanities group. The existence, validity, and relevance of citations were examined irrespective of the corresponding DOIs. The results are summarized in <xref ref-type="table" rid="table2">Table 2</xref>. A list of the included topics and a sample of prompts to ChatGPT are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Data analysis results.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="270"/>
            <col width="240"/>
            <col width="240"/>
            <col width="250"/>
            <thead>
              <tr valign="bottom">
                <td>Variables</td>
                <td>Natural sciences (n=55)</td>
                <td>Humanities (n=47)</td>
                <td><italic>P</italic> value<sup>a</sup></td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>Citation exists, n (%)</td>
                <td>40 (72.7)</td>
                <td>36 (76.6)</td>
                <td>.42</td>
              </tr>
              <tr valign="top">
                <td>Citation accurate, n (%)</td>
                <td>37 (67.3)</td>
                <td>29 (61.7)</td>
                <td>.35</td>
              </tr>
              <tr valign="top">
                <td>Relevant, n (%)</td>
                <td>39 (70.9)</td>
                <td>35 (74.5)</td>
                <td>.43</td>
              </tr>
              <tr valign="top">
                <td>DOI<sup>b</sup> exists, n (%)</td>
                <td>39 (70.9)</td>
                <td>18 (38.3)</td>
                <td>.001</td>
              </tr>
              <tr valign="top">
                <td>DOI accurate, n (%)</td>
                <td>18 (32.7)</td>
                <td>4 (8.5)</td>
                <td>.003</td>
              </tr>
              <tr valign="top">
                <td>DOI hallucination, n (%)</td>
                <td>34 (61.8)</td>
                <td>42 (89.4)</td>
                <td>.001</td>
              </tr>
              <tr valign="top">
                <td>Levenshtein distance, mean (SD)</td>
                <td>64.13 (42.26)</td>
                <td>42.15 (40.23)</td>
                <td>.009</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Categorical variables were compared using the Fisher exact test; the continuous variable (Levenshtein distance) was compared using the independent-sample <italic>t</italic> test.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>DOI: Digital Object Identifier.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
      </sec>
      <sec>
        <title>Citation Existence and Accuracy</title>
        <p>Of the 102 generated citations, 76 (74.5%) were found to be real and exist in the published literature, with 72.7% and 76.6% of the citations verified in the natural sciences and humanities groups, respectively. There was no significant difference between the two groups (<italic>P</italic>=.42), indicating that the validity of the citations was relatively consistent between the two domains. Similarly, when assessing the accuracy of the citations, no significant difference was observed (<xref ref-type="table" rid="table2">Table 2</xref>).</p>
      </sec>
      <sec>
        <title>Citation Relevance</title>
        <p>The relevance of citations generated by ChatGPT was evaluated by assessing whether they were appropriate and contextually meaningful within the research topics. Our analysis indicated that 70.9% and 74.5% of citations in the natural sciences and humanities categories were deemed relevant, respectively (<xref ref-type="table" rid="table2">Table 2</xref>). The difference was not statistically significant (<italic>P</italic>=.43), suggesting that ChatGPT demonstrated a similar ability to generate contextually relevant citations in both domains.</p>
      </sec>
      <sec>
        <title>DOI Existence, Accuracy, and Hallucination</title>
        <p>Our analysis revealed significant differences between the two domains with respect to DOIs. In the natural sciences, 70.9% of the included DOIs were real, whereas in the humanities, only 38.3% of the DOIs generated were real (<italic>P</italic>=.001; <xref ref-type="table" rid="table2">Table 2</xref>). Similarly, the level of DOI accuracy was significantly higher for the natural sciences than for the humanities (<italic>P</italic>=.003). Moreover, the occurrence of DOI hallucination, where ChatGPT generates DOIs that do not correspond with the existing literature, was more prevalent in the humanities than in the natural sciences (<italic>P</italic>=.001). The mean Levenshtein distance, which measures the deviation between the generated DOI and the actual DOI, was significantly higher in the natural sciences group than in the humanities (<italic>P</italic>=.009; <xref ref-type="table" rid="table2">Table 2</xref>).</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>The results of this study shed light on the performance of ChatGPT (GPT-3.5) as an academic writing assistant in generating citations and references in natural sciences and humanities topics. Our findings reveal notable differences in the accuracy and reliability of the citations and references generated by ChatGPT when applied to natural sciences and humanities topics. Hallucination in the context of LLMs such as ChatGPT refers to a phenomenon where the model generates content that is incorrect, fabricated, or not grounded in reality. Hallucination occurs when the model produces information that appears plausible or contextually relevant but lacks accuracy or fidelity to real-world knowledge.</p>
        <p>The most striking observation was the significant disparity in the existence and accuracy of the DOIs between the two domains. In natural sciences topics, DOIs were real in 70.9% of the generated citations, representing a significantly higher rate compared to the low rate of 38.3% real DOIs in the humanities topics. The discrepancies in the DOI existence and accuracy in the two domains may be attributed to the differential adoption and availability of DOIs across academic disciplines, where the natural sciences literature has often been more proactive in adopting the DOI system of referencing and linking to scholarly works than the humanities. It is a general practice that journals publishing on the natural sciences frequently mandate DOI inclusion, whereas publishers in the humanities have been slower to adopt such standards [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref22">22</xref>]. Consequently, the performance of the ChatGPT LLM in generating accurate DOIs appears to reflect these disciplinary disparities.</p>
        <p>LLMs may generate fictional “facts” presented as true “real-world facts,” which is referred to as hallucination [<xref ref-type="bibr" rid="ref19">19</xref>,<xref ref-type="bibr" rid="ref23">23</xref>]. In this study, we considered hallucination to have occurred if the DOI of the generated citation was not real or was real but was linked to a different source. DOI hallucination was more frequent in the humanities (89.4%) than in the natural sciences (61.8%). This finding may be explained by the broader and less structured nature of the humanities literature. There is also a high tendency to provide citations from books and other media that do not use DOIs in the humanities. Therefore, researchers in the humanities should not rely on DOIs generated by ChatGPT. Even when ChatGPT generates DOIs for humanities citations, they are more likely to deviate from the correct DOI, potentially leading to the inability to access the cited sources and use the DOIs in citation management tools such as EndNote.</p>
        <p>In contrast to the disparities observed in DOI-related metrics, our study found a remarkable consistency in the existence, validity, and relevance of the generated citations in the natural sciences and humanities, with real citations found 72.7% and 76.6% of the time and accurate citations confirmed in 67.3% and 61.7% of cases, respectively. This suggests that the citations generated by ChatGPT can be expected to be reliable approximately 60% of the time.</p>
        <p>The divergent performance of ChatGPT between the natural sciences and humanities underscores the importance of considering disciplinary nuances when implementing AI-driven writing assistants in academic contexts. Researchers and writers in both domains should be aware of the strengths and limitations of such tools, particularly in relation to citation practices and DOI accuracy. Future research could delve deeper into the factors influencing DOI accuracy and explore strategies for improving DOI generation by LLMs in the humanities literature. Additionally, the development of domain-specific AI writing models may offer tailored solutions to enhance citation and reference accuracy in various academic disciplines.</p>
        <p>In this study, we focused only on the potential use of LLMs in citations and references in scholarly writing; however, the scope to which these models are going to be adopted in academic works is much broader. We believe that these models will be improved over time and that they are here to stay. As such, our argument in this paper is not that LLMs should not be used in scholarly writing, but rather that in their current iteration, we ought to be aware of their limitations, primarily concerning the reliability of not only the text they generate but also how they interpret that text.</p>
        <p>Although the transformer models that are the foundation of LLMs are very capable of handling a significant amount of information, they still do have context-window limitations. The context window is the textual range or span of the input that the LLM can evaluate to generate a response at any given moment. As an example, GPT-3 has a context window of 2,048 tokens, whereas GPT-4’s extended context window is 32,768 tokens. As such, since the size of the context window impacts model performance (larger is better), GPT-4 outperforms GPT-3 (at the cost of more computation and memory). In scientific knowledge, context is key. Removing a word from the context may greatly affect the information being conveyed. Therefore, we believe that the future of LLMs in academia will rely on fine-tuning the LLMs to capitalize on memorization where necessary, reproducibility and stability of the models, as well as access to the latest information rather than only the training data.</p>
      </sec>
      <sec>
        <title>Limitations</title>
        <p>There were several limitations to this study. The study included a limited number of topics (10 in total), which can only offer insight but cannot possibly cover the full spectrum of complexity and diversity within the two disciplines. Only ChatGPT 3.5 was prompted since it is the most widely used LLM for this purpose and has a free tier that the majority of users rely on. Newer models, including GPT-4, Claude+, and Google’s Gemini, may give significantly different results. Our study focused on the accuracy of citations and DOIs without an exploration of potential user feedback or subjective assessment of the overall quality and coherence of the generated content. These limitations can be addressed in future research.</p>
      </sec>
      <sec>
        <title>Conclusion</title>
        <p>In conclusion, our study provides valuable insights into the performance of ChatGPT in generating citations and references across interdisciplinary domains. These findings contribute to the ongoing discourse on the use of LLMs in scholarly writing, emphasizing the need for nuanced consideration of discipline-specific challenges and the importance of robust validation processes to ensure the accuracy and reliability of generated content.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>List of included topics and ChatGPT 3.5 prompt structure.</p>
        <media xlink:href="jmir_v26i1e52935_app1.docx" xlink:title="DOCX File , 15 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">BERT</term>
          <def>
            <p>bidirectional encoder representations from transformers</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">DOI</term>
          <def>
            <p>Digital Object Identifier</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">RNN</term>
          <def>
            <p>recurrent neural network</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>This work was supported by the Municipal Key Technical Research and Development Program of Ningbo (2023Z160).</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>The data sets generated during and/or analyzed during this study are available from the corresponding author upon reasonable request.</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="conflict">
        <p>None declared.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Golan</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Reddy</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Muthigi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Ramasamy</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence in academic writing: a paradigm-shifting technological advance</article-title>
          <source>Nat Rev Urol</source>
          <year>2023</year>
          <month>06</month>
          <volume>20</volume>
          <issue>6</issue>
          <fpage>327</fpage>
          <lpage>328</lpage>
          <pub-id pub-id-type="doi">10.1038/s41585-023-00746-x</pub-id>
          <pub-id pub-id-type="medline">36829078</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41585-023-00746-x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Checco</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bracciale</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Loreti</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Pinfield</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bianchi</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>AI-assisted peer review</article-title>
          <source>Humanit Soc Sci Commun</source>
          <year>2021</year>
          <month>01</month>
          <day>25</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>25</fpage>
          <pub-id pub-id-type="doi">10.1057/s41599-020-00703-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hutson</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Could AI help you to write your next paper?</article-title>
          <source>Nature</source>
          <year>2022</year>
          <month>11</month>
          <volume>611</volume>
          <issue>7934</issue>
          <fpage>192</fpage>
          <lpage>193</lpage>
          <pub-id pub-id-type="doi">10.1038/d41586-022-03479-w</pub-id>
          <pub-id pub-id-type="medline">36316468</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-022-03479-w</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Conroy</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Scientists used ChatGPT to generate an entire paper from scratch - but is it any good?</article-title>
          <source>Nature</source>
          <year>2023</year>
          <month>07</month>
          <volume>619</volume>
          <issue>7970</issue>
          <fpage>443</fpage>
          <lpage>444</lpage>
          <pub-id pub-id-type="doi">10.1038/d41586-023-02218-z</pub-id>
          <pub-id pub-id-type="medline">37419951</pub-id>
          <pub-id pub-id-type="pii">10.1038/d41586-023-02218-z</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Paskin</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Digital object identifier (DOI®) system</article-title>
          <source>Encyclopedia of Library and Information Sciences</source>
          <year>2010</year>
          <publisher-loc>Milton Park</publisher-loc>
          <publisher-name>Taylor and Francis</publisher-name>
          <fpage>1586</fpage>
          <lpage>1592</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hosseini</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Horbach</surname>
              <given-names>SPJM</given-names>
            </name>
          </person-group>
          <article-title>Fighting reviewer fatigue or amplifying bias? Considerations and recommendations for use of ChatGPT and other large language models in scholarly peer review</article-title>
          <source>Res Integr Peer Rev</source>
          <year>2023</year>
          <month>05</month>
          <day>18</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>4</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://researchintegrityjournal.biomedcentral.com/articles/10.1186/s41073-023-00133-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s41073-023-00133-5</pub-id>
          <pub-id pub-id-type="medline">37198671</pub-id>
          <pub-id pub-id-type="pii">10.1186/s41073-023-00133-5</pub-id>
          <pub-id pub-id-type="pmcid">PMC10191680</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Santra</surname>
              <given-names>PP</given-names>
            </name>
            <name name-style="western">
              <surname>Majhi</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>Scholarly communication and machine-generated text: is it finally AI vs AI in plagiarism detection?</article-title>
          <source>J Inf Knowl</source>
          <year>2023</year>
          <month>07</month>
          <day>01</day>
          <volume>60</volume>
          <issue>3</issue>
          <fpage>175</fpage>
          <lpage>183</lpage>
          <pub-id pub-id-type="doi">10.17821/srels/2023/v60i3/171028</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Esplugas</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>The use of artificial intelligence (AI) to enhance academic communication, education and research: a balanced approach</article-title>
          <source>J Hand Surg Eur Vol</source>
          <year>2023</year>
          <month>09</month>
          <day>07</day>
          <volume>48</volume>
          <issue>8</issue>
          <fpage>819</fpage>
          <lpage>822</lpage>
          <pub-id pub-id-type="doi">10.1177/17531934231185746</pub-id>
          <pub-id pub-id-type="medline">37417005</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>AHS</given-names>
            </name>
          </person-group>
          <article-title>Analysis of ChatGPT tool to assess the potential of its utility for academic writing in biomedical domain</article-title>
          <source>BEMS Reports</source>
          <year>2023</year>
          <month>01</month>
          <day>05</day>
          <volume>9</volume>
          <issue>1</issue>
          <fpage>24</fpage>
          <lpage>30</lpage>
          <pub-id pub-id-type="doi">10.5530/bems.9.1.5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Marchandot</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Matsushita</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Carmona</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Trimaille</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Morel</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT: the next frontier in academic writing for cardiologists or a pandora's box of ethical dilemmas</article-title>
          <source>Eur Heart J Open</source>
          <year>2023</year>
          <month>03</month>
          <volume>3</volume>
          <issue>2</issue>
          <fpage>oead007</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36915398"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/ehjopen/oead007</pub-id>
          <pub-id pub-id-type="medline">36915398</pub-id>
          <pub-id pub-id-type="pii">oead007</pub-id>
          <pub-id pub-id-type="pmcid">PMC10006694</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Zhan</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT in scientific writing: a cautionary tale</article-title>
          <source>Am J Med</source>
          <year>2023</year>
          <month>08</month>
          <volume>136</volume>
          <issue>8</issue>
          <fpage>725</fpage>
          <lpage>726</lpage>
          <pub-id pub-id-type="doi">10.1016/j.amjmed.2023.02.011</pub-id>
          <pub-id pub-id-type="medline">36906169</pub-id>
          <pub-id pub-id-type="pii">S0002-9343(23)00159-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alkaissi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>McFarlane</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Artificial hallucinations in ChatGPT: implications in scientific writing</article-title>
          <source>Cureus</source>
          <year>2023</year>
          <month>02</month>
          <volume>15</volume>
          <issue>2</issue>
          <fpage>e35179</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36811129"/>
          </comment>
          <pub-id pub-id-type="doi">10.7759/cureus.35179</pub-id>
          <pub-id pub-id-type="medline">36811129</pub-id>
          <pub-id pub-id-type="pmcid">PMC9939079</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vaswani</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shazeer</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Parmar</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Uszkoreit</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Gomez</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Kaiser</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Polosukhin</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Attention is all you need</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on June 12, 2017. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1706.03762"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bahdanau</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Cho</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Neural machine translation by jointly learning to align and translate</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on September 1, 2014. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1409.0473"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Radford</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Narasimhan</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Salimans</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Sutskever</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <article-title>Improving language understanding by generative pre-training</article-title>
          <source>OpenAI</source>
          <year>2018</year>
          <access-date>2024-04-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openai.com/research/language-unsupervised">https://openai.com/research/language-unsupervised</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Devlin</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Chang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Toutanova</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>BERT: Pre-training of deep bidirectional transformers for language understanding</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on October 11, 2018</comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.1810.04805</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Degelman</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>APA style essentials</article-title>
          <source>Vanguard University</source>
          <year>2000</year>
          <access-date>2024-04-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.vanguard.edu/uploaded/research/apa_style_guide/apastyleessentials.pdf">https://www.vanguard.edu/uploaded/research/apa_style_guide/apastyleessentials.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="web">
          <source>DOI Foundation</source>
          <access-date>2024-04-01</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/">https://doi.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Salvagno</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Taccone</surname>
              <given-names>FS</given-names>
            </name>
            <name name-style="western">
              <surname>Gerli</surname>
              <given-names>AG</given-names>
            </name>
          </person-group>
          <article-title>Artificial intelligence hallucinations</article-title>
          <source>Crit Care</source>
          <year>2023</year>
          <month>05</month>
          <day>10</day>
          <volume>27</volume>
          <issue>1</issue>
          <fpage>180</fpage>
          <pub-id pub-id-type="doi">10.1186/s13054-023-04473-y</pub-id>
          <pub-id pub-id-type="medline">37165401</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13054-023-04473-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC10170715</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yujian</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bo</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>A normalized Levenshtein distance metric</article-title>
          <source>IEEE Trans Pattern Anal Mach Intell</source>
          <year>2007</year>
          <month>06</month>
          <volume>29</volume>
          <issue>6</issue>
          <fpage>1091</fpage>
          <lpage>1095</lpage>
          <pub-id pub-id-type="doi">10.1109/tpami.2007.1078</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Eve</surname>
              <given-names>MP</given-names>
            </name>
          </person-group>
          <source>Open Access and the Humanities</source>
          <year>2014</year>
          <publisher-loc>Cambridge, UK</publisher-loc>
          <publisher-name>Cambridge University Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Narayan</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Luca</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Tiffen</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>England</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Booth</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Boateng</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Scholarly communication practices in humanities and social sciences: a study of researchers' attitudes and awareness of open access</article-title>
          <source>Open Inf Sci</source>
          <year>2018</year>
          <volume>2</volume>
          <issue>1</issue>
          <fpage>168</fpage>
          <lpage>180</lpage>
          <pub-id pub-id-type="doi">10.1515/opis-2018-0013</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Cui</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Cai</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Fu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Zhao</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Luu</surname>
              <given-names>AT</given-names>
            </name>
            <name name-style="western">
              <surname>Bi</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Shi</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Siren's song in the AI ocean: a survey on hallucination in large language models</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on September 3, 2023. <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2309.01219"/>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
