<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "http://dtd.nlm.nih.gov/publishing/2.0/journalpublishing.dtd">
<article xmlns:xlink="http://www.w3.org/1999/xlink" article-type="research-article" dtd-version="2.0">
  <front>
    <journal-meta>
      <journal-id journal-id-type="publisher-id">JMIR</journal-id>
      <journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id>
      <journal-title>Journal of Medical Internet Research</journal-title>
      <issn pub-type="epub">1438-8871</issn>
      <publisher>
        <publisher-name>JMIR Publications</publisher-name>
        <publisher-loc>Toronto, Canada</publisher-loc>
      </publisher>
    </journal-meta>
    <article-meta>
      <article-id pub-id-type="publisher-id">v27i1e77893</article-id>
      <article-id pub-id-type="pmid">40825542</article-id>
      <article-id pub-id-type="doi">10.2196/77893</article-id>
      <article-categories>
        <subj-group subj-group-type="heading">
          <subject>Original Paper</subject>
        </subj-group>
        <subj-group subj-group-type="article-type">
          <subject>Original Paper</subject>
        </subj-group>
      </article-categories>
      <title-group>
        <article-title>Magnitude and Impact of Hallucinations in Tabular Synthetic Health Data on Prognostic Machine Learning Models: Validation Study</article-title>
      </title-group>
      <contrib-group>
        <contrib contrib-type="editor">
          <name>
            <surname>Sarvestan</surname>
            <given-names>Javad</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Beristain</surname>
            <given-names>Andoni</given-names>
          </name>
        </contrib>
        <contrib contrib-type="reviewer">
          <name>
            <surname>Vega-Marquez</surname>
            <given-names>Belen</given-names>
          </name>
        </contrib>
      </contrib-group>
      <contrib-group>
        <contrib id="contrib1" contrib-type="author">
          <name name-style="western">
            <surname>Pilgram</surname>
            <given-names>Lisa</given-names>
          </name>
          <degrees>MD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <xref rid="aff3" ref-type="aff">3</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-1020-0650</ext-link>
        </contrib>
        <contrib id="contrib2" contrib-type="author">
          <name name-style="western">
            <surname>El Kababji</surname>
            <given-names>Samer</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0002-7642-2280</ext-link>
        </contrib>
        <contrib id="contrib3" contrib-type="author">
          <name name-style="western">
            <surname>Liu</surname>
            <given-names>Dan</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0001-9632-4736</ext-link>
        </contrib>
        <contrib id="contrib4" contrib-type="author" corresp="yes">
          <name name-style="western">
            <surname>El Emam</surname>
            <given-names>Khaled</given-names>
          </name>
          <degrees>PhD</degrees>
          <xref rid="aff1" ref-type="aff">1</xref>
          <xref rid="aff2" ref-type="aff">2</xref>
          <address>
            <institution>CHEO Research Institute</institution>
            <institution>Children's Hospital of Eastern Ontario</institution>
            <addr-line>401 Smyth Road</addr-line>
            <addr-line>Ottawa, ON, K1H 5B2</addr-line>
            <country>Canada</country>
            <phone>1 7377600</phone>
            <email>kelemam@ehealthinformation.ca</email>
          </address>
          <ext-link ext-link-type="orcid">https://orcid.org/0000-0003-3325-4149</ext-link>
        </contrib>
      </contrib-group>
      <aff id="aff1">
        <label>1</label>
        <institution>School of Epidemiology and Public Health</institution>
        <institution>Faculty of Medicine</institution>
        <institution>University of Ottawa</institution>
        <addr-line>Ottawa, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff2">
        <label>2</label>
        <institution>CHEO Research Institute</institution>
        <institution>Children's Hospital of Eastern Ontario</institution>
        <addr-line>Ottawa, ON</addr-line>
        <country>Canada</country>
      </aff>
      <aff id="aff3">
        <label>3</label>
        <institution>Department of Nephrology and Medical Intensive Care</institution>
        <institution>Charité - Universitätsmedizin Berlin</institution>
        <addr-line>Berlin</addr-line>
        <country>Germany</country>
      </aff>
      <author-notes>
        <corresp>Corresponding Author: Khaled El Emam <email>kelemam@ehealthinformation.ca</email></corresp>
      </author-notes>
      <pub-date pub-type="collection">
        <year>2025</year>
      </pub-date>
      <pub-date pub-type="epub">
        <day>18</day>
        <month>8</month>
        <year>2025</year>
      </pub-date>
      <volume>27</volume>
      <elocation-id>e77893</elocation-id>
      <history>
        <date date-type="received">
          <day>21</day>
          <month>5</month>
          <year>2025</year>
        </date>
        <date date-type="rev-request">
          <day>18</day>
          <month>6</month>
          <year>2025</year>
        </date>
        <date date-type="rev-recd">
          <day>15</day>
          <month>7</month>
          <year>2025</year>
        </date>
        <date date-type="accepted">
          <day>21</day>
          <month>7</month>
          <year>2025</year>
        </date>
      </history>
      <copyright-statement>©Lisa Pilgram, Samer El Kababji, Dan Liu, Khaled El Emam. Originally published in the Journal of Medical Internet Research (https://www.jmir.org), 18.08.2025.</copyright-statement>
      <copyright-year>2025</copyright-year>
      <license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/">
        <p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (https://creativecommons.org/licenses/by/4.0/), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. The complete bibliographic information, a link to the original publication on https://www.jmir.org/, as well as this copyright and license information must be included.</p>
      </license>
      <self-uri xlink:href="https://www.jmir.org/2025/1/e77893" xlink:type="simple"/>
      <abstract>
        <sec sec-type="background">
          <title>Background</title>
          <p>Generative artificial intelligence (AI) for tabular synthetic data generation (SDG) has significant potential to accelerate health care research and innovation. A critical limitation of generative AI, however, is hallucinations. Although this has been commonly observed in text-generating models, it may also occur in tabular SDG.</p>
        </sec>
        <sec sec-type="objective">
          <title>Objective</title>
          <p>This study aims to investigate the magnitude of hallucinations in tabular synthetic data, whether their frequency increases with training data complexity, and the extent to which they impact the utility of synthetic data for downstream prognostic machine learning (ML) modeling tasks.</p>
        </sec>
        <sec sec-type="methods">
          <title>Methods</title>
          <p>On the basis of 12 large and high-dimensional real-world health care datasets, 6354 training datasets of different complexity were created by varying the subset of variables included in each dataset. Synthetic data were generated using 7 different SDG models. Hallucinations were defined as synthetic records that did not exist in the population, and the hallucination rate (HR) was the proportion of hallucinations in a synthetic dataset. Classification was the downstream prognostic modeling task, conducted via an ML approach (light gradient boosting machine) and an artificial neural network (multilayer perceptron). Mixed-effects models were fitted to examine the relationship between training data complexity and the HR, as well as the relationship between the HR and the predictive performance of AI and ML models when trained on the synthetic data.</p>
        </sec>
        <sec sec-type="results">
          <title>Results</title>
          <p>The HR ranged from 0.3% to 100% (median 99.1%, IQR 98.5%-100.0%) and increased with training data complexity. However, in most SDG models, the HR did not affect AI and ML prognostic model performance. In the SDG models in which a significant association was detected, the estimated effect was very small, with a maximum decrease in the area under the receiver operating characteristic curve of –0.0002 (95% CI –0.0003 to –0.0002, <italic>P</italic>&#60;.001) in light gradient boosting machine and –0.0001 (95% CI –0.0002 to –0.0001, <italic>P</italic>=.002) in multilayer perceptron.</p>
        </sec>
        <sec sec-type="conclusions">
          <title>Conclusions</title>
          <p>These findings suggest that while hallucinations may be very common in synthetic tabular health data, they do not necessarily impair its utility for prognostic modeling.</p>
        </sec>
      </abstract>
      <kwd-group>
        <kwd>synthetic data</kwd>
        <kwd>data utility</kwd>
        <kwd>hallucinations</kwd>
        <kwd>generative models</kwd>
        <kwd>artificial intelligence</kwd>
        <kwd>AI</kwd>
      </kwd-group>
    </article-meta>
  </front>
  <body>
    <sec sec-type="introduction">
      <title>Introduction</title>
      <p>Generative models are a class of artificial intelligence (AI) and machine learning (ML) models that create new data from the input data they were trained on. During the training process, generative models learn the underlying joint probability distribution of the training data and sample output data from that distribution.</p>
      <sec>
        <title>Hallucinations in Generative Image and Text Modeling</title>
        <p>The term “hallucination” in generative modeling first appeared in the context of creating high-resolution images from low-resolution input [<xref ref-type="bibr" rid="ref1">1</xref>]. It described the ability of a model to generate output that exceeded the information learned from its input. This was considered a positive feature as face recognition or verification applications required high-resolution images; yet, often only low-resolution images were available. Models that generated such hallucinations were able to output high-resolution face images based on a lower-quality input and were built upon convolutional neural networks [<xref ref-type="bibr" rid="ref2">2</xref>] or generative adversarial networks [<xref ref-type="bibr" rid="ref3">3</xref>-<xref ref-type="bibr" rid="ref11">11</xref>].</p>
        <p>With the rise of large language models (LLMs), such as generative pretrained transformers, the term “hallucination” became more popular and took on the meaning that we currently use. It describes a specific form of generated output that can be seen as implausible, inconsistent, or nonexistent. Ji et al [<xref ref-type="bibr" rid="ref12">12</xref>] define it as “generated content that is nonsensical or unfaithful to the provided source content.” This means hallucinations distinguish themselves from other types of output by a certain degree of unexpectedness and a higher deviation from training data. Today, 2 different notions of hallucinations are commonly used. The first one captures violations of the concept of <italic>factuality</italic> where the real world is used as the benchmark, while the second one is based on <italic>faithfulness,</italic> which describes consistency and truthfulness to the training data [<xref ref-type="bibr" rid="ref12">12</xref>].</p>
        <p>Hallucinations in the context of LLMs are largely seen as problematic. Multiple authors warn of overreliance on LLMs, particularly due to potential hallucinations that may be misleading [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>]. In evaluation studies across various sectors, generic LLMs were shown to produce hallucinations [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref18">18</xref>]. For example, nontrivial deviations from the real world have been detected in generated scientific reports [<xref ref-type="bibr" rid="ref19">19</xref>], and LLMs have been found to have limited ability to provide genuine references [<xref ref-type="bibr" rid="ref20">20</xref>-<xref ref-type="bibr" rid="ref22">22</xref>].</p>
      </sec>
      <sec>
        <title>The Challenge With Hallucinations in Health Care</title>
        <p>Hallucinations are particularly harmful in fields such as medicine where there is little room for error and decisions can have severe consequences [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. The National Academies of Sciences, Engineering, and Medicine consequently lists hallucinations as one of the major risks of generative AI in the health care sector, alongside concerns such as privacy, bias, output limitations, and algorithmic brittleness [<xref ref-type="bibr" rid="ref14">14</xref>]. Medical hallucinations in the context of LLMs have been broadly defined as “incorrect or misleading medical information that could adversely affect clinical decision making and patient outcomes” [<xref ref-type="bibr" rid="ref25">25</xref>].</p>
        <p>This definition encompasses the notion of <italic>factuality</italic> as it evaluates the generated content against the real world. In addition, it extends beyond <italic>factuality</italic> by including any medical information that is misleading, such as biased conclusions or reasoning errors, and explicitly considering the potential harm that may result from such hallucinations. This broader definition shows that in the health care sector, LLM-generated hallucinations are viewed primarily through the lens of potential harmful consequences. Such consequences can be related to patient safety but also include the erosion of trust in AI and ML systems, increased workload or workflow disruptions in clinical settings, and unresolved ethical and legal questions about accountability [<xref ref-type="bibr" rid="ref14">14</xref>,<xref ref-type="bibr" rid="ref25">25</xref>].</p>
      </sec>
      <sec>
        <title>Hallucinations in Generative Tabular Modeling</title>
        <p>Synthetic data generation (SDG) represents another form of generative modeling where synthetic tabular data are created by a model. Although SDG can be based on distributions known a priori and informed by background knowledge, published summary statistics, or established risk calculators [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref30">30</xref>], our focus here is on synthetic data generated based on a real dataset that is used to train a generative model, which outputs a fully synthetic tabular dataset.</p>
        <p>Most research in tabular SDG focuses on improving and evaluating SDG models in terms of utility, privacy, and fairness [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>]. The goal is to mimic the statistical properties of real data while maintaining low disclosure risks and avoiding bias in the generated synthetic data to ultimately ensure that the synthetic data perform well in downstream tasks. However, the concept of hallucinations has not been precisely defined or evaluated in the context of tabular SDG.</p>
      </sec>
      <sec>
        <title>Objectives</title>
        <p>This study aimed to evaluate (1) the extent to which generated synthetic health data contain hallucinations, which has not been previously studied; (2) the impact of dataset complexity on the occurrence of hallucinated records, the hypothesis being that datasets with higher complexity will have a higher rate of hallucinations; and (3) the association between the rate of hallucinations and the performance of prognostic AI and ML models, the hypothesis being that the greater the rate of hallucinations, the less effective the prognostic models would be.</p>
      </sec>
    </sec>
    <sec sec-type="methods">
      <title>Methods</title>
      <sec>
        <title>Definition of Hallucinations in Tabular Synthetic Data</title>
        <p>Utility in synthetic data has been typically defined in terms of fidelity and downstream utility. Fidelity means that the synthetic data are similar to the training data, and metrics can be used to indicate how close the records are [<xref ref-type="bibr" rid="ref33">33</xref>-<xref ref-type="bibr" rid="ref35">35</xref>]. For example, the Hellinger distance measures similarity in multivariate distributions; the cluster metric compares the clustering structure [<xref ref-type="bibr" rid="ref34">34</xref>]. The training dataset serves as a basis for comparison, and high-fidelity synthetic data are data that resemble the training data very well. This is similar to the aforementioned concept of <italic>faithfulness</italic>. A violation of fidelity can be seen as diversity (<xref rid="figure1" ref-type="fig">Figure 1</xref>). Diverse records are those that are not similar to the training data but are still quite similar to the population from which the training data were drawn. In SDG, the goal is typically not to have complete <italic>faithfulness</italic> to the training data, as this could expose individuals’ personal information. Instead, diverse records that maintain the statistical properties of the population can be privacy preserving while supporting, for example, a prognostic model to generalize better and perform reasonably well on unseen data from the same population.</p>
        <p>Hallucinations in tabular synthetic data can be defined as synthetic records that are nonexistent in the population (<xref rid="figure1" ref-type="fig">Figure 1</xref>). This can be because they are implausible (eg, a female individual with prostate cancer) or are plausible but just do not exist in the population (eg, there is no male individual in a specific population of patients with breast cancer). It thereby incorporates the concept of <italic>factuality</italic> rather than <italic>faithfulness</italic> as the evaluation is performed with reference to the population and not the training dataset [<xref ref-type="bibr" rid="ref36">36</xref>].</p>
        <p>It has been argued that hallucinations represent the low-likelihood outputs of a model [<xref ref-type="bibr" rid="ref37">37</xref>]. Consequently, as for any generative model, we can assume that SDG leads to hallucinated records. However, it is unknown to what extent this happens in tabular SDG. In addition, one can reasonably argue that training a prognostic model on datasets with hallucinated records may degrade the performance of the model on unseen (ie, holdout) data, as the model would learn patterns that are not, and cannot be, in unseen data from the same population. Therefore, in addition to hallucinations eroding trust in synthetic data, they may have the practical consequence of reducing the performance of at least prognostic analytic workloads with the synthetic data.</p>
        <fig id="figure1" position="float">
          <label>Figure 1</label>
          <caption>
            <p>Hallucinations in synthetic data. The green circle represents the synthetic data (S). Within S, high-fidelity records are synthetic records that are similar to the training data (T; dotted portion); diverse records are the ones that are not similar to T but are similar to the population (P; dense dotted portion); hallucinated records are those that cannot be considered as being representative of the training or the population data (striped portion).</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e77893_fig1.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Study Workflow</title>
        <p>The overall workflow of this study included five major steps:</p>
        <list list-type="order">
          <list-item>
            <p>Creating population variants with varying complexity from 12 real-world health care populations</p>
          </list-item>
          <list-item>
            <p>Sampling a training dataset from each population variant to train 7 different SDG models, spanning from more traditional statistical to deep learning models</p>
          </list-item>
          <list-item>
            <p>Generating 10 synthetic datasets from each trained SDG model and identifying hallucinated records in each of the synthetic datasets</p>
          </list-item>
          <list-item>
            <p>Assessing the downstream predictive modeling performance in each of the synthetic and training datasets via light gradient boosted decision trees (LGBM) and multilayer perceptron (MLP)</p>
          </list-item>
          <list-item>
            <p>Estimating the effect of complexity on hallucinations as well as the effect of hallucinations on downstream modeling</p>
          </list-item>
        </list>
        <p>The creation of population variants from real-world health care populations and subsequent SDG (steps 1 and 2) is illustrated in <xref rid="figure2" ref-type="fig">Figure 2</xref> and can be summarized as follows. For each real-world population, diverse population variants with the same records but varying numbers and combinations of variables were created to capture a large space of dataset complexity. A random sample of 10,000 records was then defined as a training dataset and a disjoint random sample of 10,000 records as a holdout dataset. From each population variant, the same training and holdout samples were taken to train the 7 SDG models and generate 10 synthetic datasets each to account for the stochasticity of the generative process.</p>
        <p>All steps were conducted in parallel in a containerized execution environment within the hospital high-performance computing infrastructure with a total of 13 graphics processing units (NVIDIA RTX A6000, each with 48 GB of memory) and 256 central processing unit cores (1 TB of available memory). Runtime varied depending on the complexity of the population variant and the SDG model, with steps 2 and 4 being the most computationally demanding steps in the overall workflow. For 1 population variant, the runtime of step 2 (ie, SDG via 7 SDG models) varied between 180 seconds and 3780 seconds (depending on the complexity of the population variant) and the runtime of step 4 (ie, downstream model training) between 46 seconds and 104 seconds for LGBM and between 62 seconds and 139 seconds for MLP (depending on the downstream task). The runtime of step 5 (ie, effect estimation across all population variants and SDG models) was approximately 1800 seconds in total.</p>
        <fig id="figure2" position="float">
          <label>Figure 2</label>
          <caption>
            <p>Creation of population (P) variants and synthetic data generation (SDG). For each real-world health care reference population (R), a core population was defined as P0 and included the core variables as defined by the downstream modeling task of R. By varying numbers and combinations of adjunct variables, z additional population variants with different levels of complexity were created (P1-Pz) so that each variant was a subset of R (P⊂R). The number of records remained the same. From these population variants, the SDG training dataset (T) was taken (subset T⊂P). The holdout dataset (H) was a disjoint subset from the same population variant, explicitly excluding all records used in the training dataset (H⊂P \ T). Across all variants, the same subset of records was selected as training and holdout datasets, respectively. Ten synthetic datasets (eg, G3.1-G3.10) were generated per SDG model (G1-G7). ⊂: proper subset (subset of randomly drawn or selected records); \: complement; ~: SDG; G: generator (SDG model).</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e77893_fig2.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Creation of Population Variants</title>
        <p>For this study, large datasets were needed to simulate a reference population. We used the real-world datasets listed in <xref ref-type="table" rid="table1">Table 1</xref>. These datasets cover a wide range of typical characteristics (eg, class imbalance, missing values, and noisy variables) that are encountered when working with real-world health data [<xref ref-type="bibr" rid="ref38">38</xref>,<xref ref-type="bibr" rid="ref39">39</xref>]. Furthermore, the datasets cover multiple domains, including hospital discharge, adverse events, public health, health surveys, and population registries.</p>
        <p>In this study, we use the term <italic>reference population</italic> to refer to the real-world dataset with its full set of records and variables. We hypothesized that the complexity of a dataset would contribute to the occurrence of hallucinations. To capture various complexities for one reference population, we derived <italic>population variants</italic> from it by varying its dimensionality. These population variants were built by adding <italic>adjunct</italic> variables to a <italic>core</italic> set of variables, and we refer to the dataset with the <italic>core</italic> variables as the <italic>core</italic> population. This means that population variants shared the same (entire) set of records but included different subsets of variables. The general term <italic>population</italic> refers to their provenance (ie, the reference population) and is used as a label for grouping rather than to describe any particular dataset. The <italic>core</italic> variables were determined by the downstream modeling task and are specific to the population. Details on the datasets, their downstream modeling tasks, and the core variables of each are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> [<xref ref-type="bibr" rid="ref23">23</xref>-<xref ref-type="bibr" rid="ref28">28</xref>,<xref ref-type="bibr" rid="ref40">40</xref>-<xref ref-type="bibr" rid="ref99">99</xref>].</p>
        <p>Depending on the original dimensionality of the reference population, the selection of the combinations of <italic>adjunct</italic> variables would result in a large combinatorial space, as discussed subsequently.</p>
        <table-wrap position="float" id="table1">
          <label>Table 1</label>
          <caption>
            <p>Characteristics of real-world populations<sup>a</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="210"/>
            <col width="350"/>
            <col width="120"/>
            <col width="100"/>
            <col width="100"/>
            <col width="120"/>
            <thead>
              <tr valign="top">
                <td>Population</td>
                <td>Brief description</td>
                <td>Core<sup>b</sup> variables, n</td>
                <td>Pool size<sup>c</sup>, n</td>
                <td>Variants<sup>d</sup>, n</td>
                <td>Reference population size, n</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>BORN<sup>e</sup></td>
                <td>Birth registry in the province of Ontario, Canada, with information about pregnancy and birth</td>
                <td>20</td>
                <td>101</td>
                <td>700</td>
                <td>968,435</td>
              </tr>
              <tr valign="top">
                <td>California hospital discharges 2008 (California)</td>
                <td>Discharge dataset from hospitals in California, United States, from 2007</td>
                <td>15</td>
                <td>387</td>
                <td>601</td>
                <td>4,017,998</td>
              </tr>
              <tr valign="top">
                <td>CCHS<sup>f</sup></td>
                <td>Canadian population survey with health information</td>
                <td>13</td>
                <td>121</td>
                <td>723</td>
                <td>904,813</td>
              </tr>
              <tr valign="top">
                <td>Canadian COVID-19 (COVID-19)</td>
                <td>Canadian COVID-19 dataset</td>
                <td>6</td>
                <td>5</td>
                <td>32</td>
                <td>1,384,881</td>
              </tr>
              <tr valign="top">
                <td>FAERS<sup>g</sup></td>
                <td>Dataset of adverse events submitted to the FDA<sup>h</sup>, United States</td>
                <td>9</td>
                <td>27</td>
                <td>614</td>
                <td>881,204</td>
              </tr>
              <tr valign="top">
                <td>Florida hospital discharges 2007 (Florida)</td>
                <td>Discharge dataset from hospitals in Florida, United States, from 2007</td>
                <td>10</td>
                <td>293</td>
                <td>601</td>
                <td>2,563,370</td>
              </tr>
              <tr valign="top">
                <td>MIMIC-III<sup>i</sup></td>
                <td>Data from intensive care unit admissions of the Beth Israel Deaconess Medical Center, United States</td>
                <td>13</td>
                <td>4</td>
                <td>16</td>
                <td>30,662</td>
              </tr>
              <tr valign="top">
                <td>New York hospital discharges 2007 (New York)</td>
                <td>Discharge dataset from hospitals in New York, United States, from 2007</td>
                <td>13</td>
                <td>317</td>
                <td>601</td>
                <td>2,608,615</td>
              </tr>
              <tr valign="top">
                <td>Nexoid COVID-19 survival calculator data (Nexoid)</td>
                <td>Web-based survey data concerning COVID-19 provided by a company in London, United Kingdom</td>
                <td>19</td>
                <td>36</td>
                <td>622</td>
                <td>968,408</td>
              </tr>
              <tr valign="top">
                <td>Texas inpatient public use data file (Texas)</td>
                <td>Discharge dataset from hospitals in Texas, United States</td>
                <td>10</td>
                <td>65</td>
                <td>642</td>
                <td>745,999</td>
              </tr>
              <tr valign="top">
                <td>Washington state hospital discharges 2007 (Washington)</td>
                <td>Discharge dataset from hospitals in Washington, United States, from 2007</td>
                <td>8</td>
                <td>349</td>
                <td>601</td>
                <td>644,902</td>
              </tr>
              <tr valign="top">
                <td>Washington state hospital discharges 2008 (Washington 2008)</td>
                <td>Discharge dataset from hospitals in Washington, United States, from 2008</td>
                <td>17</td>
                <td>407</td>
                <td>601</td>
                <td>652,344</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table1fn1">
              <p><sup>a</sup>The reference populations were transformed to be based on individual-level (not event-level) observations. For the Better Outcomes Registry &#38; Network population, the individual was the newborn. The exception was the US Food and Drug Administration Adverse Event Reporting System, which could not be transformed due to the absence of a unique identifier; however, given that adverse events are rare in general, it can be expected that there is a very low number of duplicate individuals.</p>
            </fn>
            <fn id="table1fn2">
              <p><sup>b</sup>Core means the number of variables defined for their downstream task.</p>
            </fn>
            <fn id="table1fn3">
              <p><sup>c</sup>Pool size is the total number of potential <italic>adjunct</italic> variables.</p>
            </fn>
            <fn id="table1fn4">
              <p><sup>d</sup>Variants are subsets derived from the reference population by reducing it to the <italic>core</italic> variables and adding varying <italic>adjunct</italic> variables.</p>
            </fn>
            <fn id="table1fn5">
              <p><sup>e</sup>BORN: Better Outcomes Registry &#38; Network.</p>
            </fn>
            <fn id="table1fn6">
              <p><sup>f</sup>CCHS: Canadian Community Health Survey.</p>
            </fn>
            <fn id="table1fn7">
              <p><sup>g</sup>FAERS: US Food and Drug Administration Adverse Event Reporting System.</p>
            </fn>
            <fn id="table1fn8">
              <p><sup>h</sup>FDA: US Food and Drug Administration.</p>
            </fn>
            <fn id="table1fn9">
              <p><sup>i</sup>MIMIC-III: Medical Information Mart for Intensive Care III.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>We define <italic>v</italic><sub>0</sub> as the number of variables in the <italic>core</italic> dataset, so those are the ones that are required for a predefined downstream modeling task. <italic>v</italic> is the number of <italic>adjunct</italic> variables that are in the dataset but not required for the downstream modeling task. Then, the dimensionality of a dataset is defined by <italic>v</italic><sub>0</sub>+<italic>v</italic>. The 12 reference populations had varying dimensionalities, so that the maximum number of potential <italic>adjunct</italic> variables varied. This is referred to as pool size <italic>m</italic>. The larger the pool size, the higher the total number of potential combinations. For example, if we want to create a dataset with 2 <italic>adjunct</italic> variables (ie, <italic>v</italic>=2) from a dataset that has 100 potential <italic>adjunct</italic> variables (ie, <italic>m</italic>=100), we have <inline-graphic xlink:href="jmir_v27i1e77893_fig6.png" xlink:type="simple" mimetype="image"/> distinct options to create a population variant by adding 2 <italic>adjunct</italic> variables to the <italic>core</italic> variables. If we considered all potential combinations for any given number of <italic>adjunct</italic> variables, the space of population variants would grow up to 1.267651×10<sup>30</sup> distinct population variants in this example.</p>
        <p>Therefore, to reduce the computational burden, we adopted a random weighted sampling scheme and analyzed, in total, 6354 variants derived from 12 health care reference populations. The sampling process is described in more detail in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Measuring Complexity</title>
        <p>While various complexity metrics for datasets have been described, many of them are specific to a downstream task, such as binary classification tasks [<xref ref-type="bibr" rid="ref100">100</xref>,<xref ref-type="bibr" rid="ref101">101</xref>]. Such metrics measure, for example, the discriminative power of each variable with respect to an outcome variable. As highlighted in the study by Cano [<xref ref-type="bibr" rid="ref100">100</xref>], complexity metrics that include multiple different structural but also distributional characteristics can become challenging to interpret because very different datasets yield similar complexity values.</p>
        <p>Sparsity measures that incorporate both the dimensionality and the size of the dataset focus on the structural characteristics of a dataset and offer a more straightforward interpretation [<xref ref-type="bibr" rid="ref101">101</xref>]. In this study, all training datasets had the same number of records, making size-related information constant. At the same time, dimensionality alone seemed insufficient to comprehensively capture structural complexity.</p>
        <p>Therefore, we considered cardinality, in addition to dimensionality, to obtain a more comprehensive but interpretable proxy for data complexity. The detailed definition, including the mathematical equation, can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>The population variants created in this study covered a large range of complexities, as depicted in <xref rid="figure3" ref-type="fig">Figure 3</xref>.</p>
        <p><xref rid="figure3" ref-type="fig">Figure 3</xref> shows that only a few variants were of low complexity because <italic>adjunct</italic> variables often included high-cardinality variables (eg, diagnosis or medication), thereby increasing dataset complexity.</p>
        <fig id="figure3" position="float">
          <label>Figure 3</label>
          <caption>
            <p>Complexity of population variants. Variants were created from the reference populations, as described, and complexity across all variants was calculated. The boxplots show the median as the central horizontal line, the lower and upper hinges represent the first and third quartiles (ie, IQR), and the whiskers represent the largest values within 1.5 times IQR from the quartiles. BORN: Better Outcomes Registry &#38; Network; CCHS: Canadian Community Health Survey; FAERS: US Food and Drug Administration Adverse Event Reporting System; MIMIC-III: Medical Information Mart for Intensive Care III.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e77893_fig3.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>SDG Models</title>
        <p>In total, 7 different types of SDG models were considered when quantifying and analyzing hallucinations in SDG. In combination with the 6354 population variants, this gives 44,478 trained SDG models, each of which generated 10 synthetic datasets.</p>
        <p>The 7 SDG models were sequential decision trees (STs) [<xref ref-type="bibr" rid="ref102">102</xref>], Bayesian networks [<xref ref-type="bibr" rid="ref103">103</xref>], conditional generative adversarial networks [<xref ref-type="bibr" rid="ref104">104</xref>], variational autoencoders (tabular variational autoencoder and robust tabular variational autoencoder) [<xref ref-type="bibr" rid="ref104">104</xref>], adversarial random forests [<xref ref-type="bibr" rid="ref105">105</xref>], and normalizing flows [<xref ref-type="bibr" rid="ref106">106</xref>]. The details on each of the SDG models are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Identification of Hallucinations</title>
        <p>To assess hallucinations, we focused on the concept of <italic>factfulness</italic> in tabular SDG. Another concept is <italic>faithfulness</italic>. The difference between these concepts is the underlying ground truth. For instance, we will consider an abstractive summarization task, where a section from a travel guide about Canada should be summarized by an LLM. This section does not contain the explicit information that Ottawa is the capital of Canada but lists the biggest cities of Canada. Then, if the output states that Montreal is the capital of Canada, this can be classified as a hallucination in terms of <italic>faithfulness</italic> because the input data had no such information. It would also be considered a hallucination in terms of <italic>factfulness</italic> as it is not aligned with the ground truth. If the LLM’s output is that Ottawa is the capital of Canada based on the same input, this would also be classified as a hallucination in terms of <italic>faithfulness</italic> but not in terms of <italic>factfulness</italic>. This is because <italic>faithfulness</italic> is evaluated based on input data adherence, while <italic>factfulness</italic> requires an external ground truth, making its assessment more challenging [<xref ref-type="bibr" rid="ref12">12</xref>].</p>
        <p>In this study, we focus on <italic>factfulness</italic> because it provides a more meaningful interpretation in tabular synthetic data where some degree of diversity from the training data (so a violation of <italic>faithfulness</italic>) is both expected and desirable [<xref ref-type="bibr" rid="ref107">107</xref>]. <italic>Factful</italic> synthetic records, in contrast, are records that appear in the population variant from which the training data are sampled but that may or may not be in the training data. In this study, we then define hallucinations in terms of <italic>factfulness</italic> as synthetic records that are nonexistent in the population variant from which the training data were sampled. This includes records that may be statistically consistent with the distribution of the training data but which nonetheless never appeared in the actual population variant.</p>
        <p>This definition has a clearer interpretation than alternative definitions that rely on semantic or probabilistic similarity and require the specification of thresholds. Such thresholds are difficult to define, particularly in our context where precedents are lacking, and have a nontrivial impact on interpretability. Our definition should also, in principle, be more sensitive than these alternative approaches. However, it is important to note that alternative definitions may lead to different conclusions, as discussed in the Strengths and Limitations section.</p>
        <p>To operationalize our definition, we singled out synthetic records that were nonexistent in the corresponding population variant by matching records between the synthetic and population variant and isolating those that were uniquely present in the synthetic data. The set of hallucinated records (HA) is then the difference between the synthetic data (S) and the population variant (P), calculated as follows:</p>
        <disp-formula><italic>HA</italic> = <italic>S \ P</italic></disp-formula>
        <p>More precisely, we applied row-wise antijoin between the synthetic data and the population variant (implemented via the <italic>dplyr</italic> package [<xref ref-type="bibr" rid="ref108">108</xref>] in R software [R Foundation for Statistical Computing]), which returned those records from the synthetic data that did not have an exact match in the corresponding population variant. This definition is functionally equivalent to a record-level Hamming distance of more than 0 from the synthetic to the closest real record. However, rather than calculating row-wise distances, we used exact match comparison, which is computationally simpler and more efficient. Treatment of missing values and numerical variables is detailed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>The parameter of interest for this study was the hallucination rate (HR) in a synthetic dataset, defined as follows:</p>
        <disp-formula>
          <graphic xlink:href="jmir_v27i1e77893_fig7.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </disp-formula>
        <p>whereby &#124;<italic>HA</italic>&#124; was the number of hallucinated records and &#124;<italic>S</italic>&#124; was the size of the synthetic dataset (ie, 10,000 records). The HR was averaged across the 10 synthetic datasets per trained SDG model.</p>
      </sec>
      <sec>
        <title>Downstream Task Performance</title>
        <p>Downstream utility was defined as prognostic AI and ML modeling performance and was assessed by train-synthetic-test-real (TSTR) utility [<xref ref-type="bibr" rid="ref109">109</xref>]. TSTR utility is when a prediction model is trained on the synthetic data and then tested on unseen real records (ie, holdout dataset) to see if it can make correct predictions [<xref ref-type="bibr" rid="ref109">109</xref>]. Accurately modeling a population is the very aim of research, and TSTR is a very meaningful metric to evaluate the utility of a synthetic dataset.</p>
        <p>The holdout dataset was composed of 10,000 random records, disjoint from the training dataset and fixed across all population variants for each real-world health care population to allow for comparability across the variants and between synthetic and real data. This corresponds to a 50:50 split for the training and holdout datasets. Importantly, to avoid any information leakage, the holdout dataset was not only independent from prognostic model training but also from SDG model training. To investigate the sensitivity to the single 50:50 split, the downstream performance of the real data was calculated over 10 additional splits. Results are detailed in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> and show that there was little variation across the splits.</p>
        <p>All reference populations came with a predefined binary classification task involving the <italic>core</italic> variables. A binary classification model was built using LGBM, which is a commonly applied ML prediction model [<xref ref-type="bibr" rid="ref110">110</xref>,<xref ref-type="bibr" rid="ref111">111</xref>]. Tree-based models are the most common type of ML prognostic methods used in clinical research [<xref ref-type="bibr" rid="ref112">112</xref>]; they perform better than linear models, such as logistic regression [<xref ref-type="bibr" rid="ref113">113</xref>-<xref ref-type="bibr" rid="ref117">117</xref>], and have also been found to perform better than deep learning models on tabular datasets [<xref ref-type="bibr" rid="ref118">118</xref>,<xref ref-type="bibr" rid="ref119">119</xref>]. In addition, we trained an MLP to account for contemporary neural network classification approaches. Model performance was assessed as the area under the receiver operating characteristic curve (AUROC) [<xref ref-type="bibr" rid="ref120">120</xref>] and averaged across the 10 synthetic datasets per trained SDG model.</p>
        <p>In LGBM, hyperparameters were chosen based on AUROC in 5-fold cross-validation during model training [<xref ref-type="bibr" rid="ref121">121</xref>]. Details with respect to the implementation and hyperparameters to select from are described in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>In MLP, we built a sequential classification model with an input layer with 16 nodes, a dropout layer, a second hidden layer with 16 nodes, and an output layer with 1 node and a sigmoid activation function for binary classification. Extensive hyperparameter tuning was not conducted, as exploratory results already demonstrated that this setup yields comparable results to LGBM. We focused instead on avoiding overfitting [<xref ref-type="bibr" rid="ref40">40</xref>]. Details with respect to the implementation and overfitting avoidance are described in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>In addition, we measured performance when using the real (training) dataset for prognostic modeling (ie, train real test real). This gave us the performance that would be possible when using real data instead of synthetic data and served as a reference point.</p>
        <p>In total, 451,134 LGBM models and 451,134 MLP models were trained.</p>
      </sec>
      <sec>
        <title>Effect Estimation</title>
        <p>We analyzed the association between data complexity and hallucinations (ie, HR) as well as hallucinations and downstream utility (ie, TSTR). We estimated the effect for each SDG model separately.</p>
        <p>Initial modeling results suggested that there was an unobserved (ie, random) effect beyond complexity contributing to HR and an effect beyond HR contributing to TSTR. This can be explained by the unique distribution, unique <italic>core</italic> variables, and the specific downstream tasks of each of the 12 populations.</p>
        <p>Random effects can be captured by mixed-effects models. Such models assess a fixed component while accounting for a random component. In this study, the random component was the provenance of the population variant, which was the 12 health care populations. We estimated the fixed effect of complexity on the outcome HR as well as the fixed effect of HR on the outcome TSTR. When estimating the effect of HR on TSTR, we only considered those populations with sufficient spread in the HR across all population variants, more precisely, where the difference between the 10th and 90th percentiles of HR was at least 0.25. Details on the models and their implementation are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
        <p>The level of significance was chosen to be .05. The odds ratio (OR) with respective 95% CI is reported as effect size for generalized linear mixed-effects models and the coefficient (or effect estimate) with respective 95% CI for linear mixed-effects models. We evaluated model fit using marginal and conditional <italic>R</italic><sup>2</sup> values. These quantify the variance explained by the fixed effects alone and by both fixed and random effects [<xref ref-type="bibr" rid="ref122">122</xref>].</p>
        <p>Given the large scale of our experiments, an important question is whether such a large number of population variants is needed to estimate the effects as described earlier. These secondary (or sensitivity) analyses confirmed the robustness of effect estimation but, more importantly, can inform potential design adjustments in terms of scale in future methodological research. They were conducted by randomly selecting 50% and 25% of the population variants for each reference population. More precisely, from the entire set of population variants per real-world reference population, we chose a random subset of 50% and 25% and used the mixed-effects models as described earlier to estimate the fixed effects of complexity on the outcome HR as well as the fixed effects of HR on the outcome TSTR. The detailed results are presented in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p>
      </sec>
      <sec>
        <title>Ethical Considerations</title>
        <p>This project has been approved by the Children’s Hospital of Eastern Ontario Research Institute Research Ethics Board (REB) protocol (24/103X).</p>
        <p>The Children’s Hospital of Eastern Ontario REB operates in compliance with, and is constituted in accordance with, the requirements of the Tri-Council Policy Statement: Ethical Conduct of Research Involving Humans [<xref ref-type="bibr" rid="ref123">123</xref>]; the International Conference on Harmonization Good Clinical Practice Consolidated Guideline [<xref ref-type="bibr" rid="ref124">124</xref>]; part C, division 5 of the Food and Drug Regulations [<xref ref-type="bibr" rid="ref125">125</xref>]; part 4 of the Natural Health Products Regulations [<xref ref-type="bibr" rid="ref126">126</xref>]; and part 3 of the Medical Devices Regulations [<xref ref-type="bibr" rid="ref127">127</xref>] and the provisions of the Ontario Personal Health Information Protection Act 2004 and its applicable regulations [<xref ref-type="bibr" rid="ref128">128</xref>].</p>
        <p>This research involved the secondary use of deidentified health care datasets originally collected for purposes other than this study. This made the potential of disclosure risks the primary ethical consideration of this study. However, all datasets were deidentified at the source by the respective data custodians and were assessed as low risk. All analyses were conducted within a secure server environment with access restricted to authorized researchers of this study. These researchers have completed institutional privacy and security training, including instruction on the appropriate handling of personal health information, and, where required by data custodians, researchers also agreed to specific terms of use and completed additional ethics or data governance training. In accordance with the Tri-Council Policy Statement: Ethical Conduct of Research Involving Humans [<xref ref-type="bibr" rid="ref123">123</xref>], individual reconsent was waived by the REB given that secondary use of deidentified data in this study posed minimal risk.</p>
      </sec>
    </sec>
    <sec sec-type="results">
      <title>Results</title>
      <sec>
        <title>Hallucinations During SDG</title>
        <p>We analyzed the HR when generating tabular health data. The minimum HR was 0.3%, and the maximum HR was 100%. We found that the median (99.1%, IQR 98.5%-100.0%) HR across all synthetic datasets was very high. This finding remained consistent when applying an alternative operational implementation of the HR (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
        <p>Complexity had a fixed effect on the HR via generalized linear mixed-effects modeling with the population as a random effect. More precisely, for each SDG model, there was a significant positive association between the complexity of the (training) data and the HR. The OR ranged from 1.03 (95% CI 1.01-1.05) in Bayesian networks to 1.16 (95% CI 1.11-1.22) in normalizing flows. As shown in <xref ref-type="table" rid="table2">Table 2</xref>, the contribution of complexity as a fixed effect to the explained variance varied across the SDG models, and the random effect was consistently a large part of the total explained variance.</p>
        <p>In <xref rid="figure4" ref-type="fig">Figure 4</xref>, the behavior of the SDG model with the lowest HR (ie, ST) is illustrated across the different populations. Notably, the effect can add up considerably with increasing complexity. As shown in <xref ref-type="table" rid="table2">Table 2</xref>, this effect is similar for the other SDG models.</p>
        <p>The fixed effect of complexity on the HR was also modeled with fewer population variants (ie, a 50% and 25% subset) as a sensitivity analysis to the sample size. The effect sizes of these sensitivity analyses were very similar to the main analysis, confirming the robustness of our results in a smaller-scale evaluation setup (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>).</p>
        <table-wrap position="float" id="table2">
          <label>Table 2</label>
          <caption>
            <p>Modeling the effect of complexity on the hallucination rate<sup>a</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <col width="200"/>
            <thead>
              <tr valign="top">
                <td>SDG<sup>b</sup> model</td>
                <td>Fixed effect complexity, OR<sup>c</sup> (95% CI)</td>
                <td><italic>P</italic> value</td>
                <td><italic>R</italic><sup>2</sup> (fixed effect)</td>
                <td><italic>R</italic><sup>2</sup> (overall)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>ST<sup>d</sup></td>
                <td>1.07 (1.03-1.11)</td>
                <td>&#60;.001</td>
                <td>0.26</td>
                <td>0.99</td>
              </tr>
              <tr valign="top">
                <td>BN<sup>e</sup></td>
                <td>1.03 (1.01-1.05)</td>
                <td>.001</td>
                <td>0.16</td>
                <td>0.99</td>
              </tr>
              <tr valign="top">
                <td>ARF<sup>f</sup></td>
                <td>1.07 (1.03-1.12)</td>
                <td>&#60;.001</td>
                <td>0.29</td>
                <td>0.99</td>
              </tr>
              <tr valign="top">
                <td>CTGAN<sup>g</sup></td>
                <td>1.11 (1.08-1.14)</td>
                <td>&#60;.001</td>
                <td>0.57</td>
                <td>0.99</td>
              </tr>
              <tr valign="top">
                <td>TVAE<sup>h</sup></td>
                <td>1.11 (1.07-1.15)</td>
                <td>&#60;.001</td>
                <td>0.47</td>
                <td>0.99</td>
              </tr>
              <tr valign="top">
                <td>RTVAE<sup>i</sup></td>
                <td>1.16 (1.10-1.23)</td>
                <td>&#60;.001</td>
                <td>0.45</td>
                <td>0.99</td>
              </tr>
              <tr valign="top">
                <td>NFlow<sup>j</sup></td>
                <td>1.16 (1.11-1.22)</td>
                <td>&#60;.001</td>
                <td>0.54</td>
                <td>0.99</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table2fn1">
              <p><sup>a</sup>Generalized linear mixed-effect models were fitted for each synthetic data generation model separately, with the following number of observations: 6354 for sequential decision trees; 6354 for Bayesian network; 6354 for adversarial random forests; 6354 for conditional generative adversarial network; 6354 for tabular variational autoencoder; 6353 for robust tabular variational autoencoder; and 6328 for normalizing flow. The population was considered as a random effect, complexity as a fixed effect, and the HR as an outcome. The odds ratios for hallucinations are indicated. We provide the variance explained (ie, <italic>R</italic><sup>2</sup>) by the fixed effect only and by both fixed and random effects together (ie, <italic>R</italic><sup>2</sup> overall) for all models.</p>
            </fn>
            <fn id="table2fn2">
              <p><sup>b</sup>SDG: synthetic data generation.</p>
            </fn>
            <fn id="table2fn3">
              <p><sup>c</sup>OR: odds ratio.</p>
            </fn>
            <fn id="table2fn4">
              <p><sup>d</sup>ST: sequential decision tree.</p>
            </fn>
            <fn id="table2fn5">
              <p><sup>e</sup>BN: Bayesian network.</p>
            </fn>
            <fn id="table2fn6">
              <p><sup>f</sup>ARF: adversarial random forest.</p>
            </fn>
            <fn id="table2fn7">
              <p><sup>g</sup>CTGAN: conditional generative adversarial network.</p>
            </fn>
            <fn id="table2fn8">
              <p><sup>h</sup>TVAE: tabular variational autoencoder.</p>
            </fn>
            <fn id="table2fn9">
              <p><sup>i</sup>RTVAE: robust tabular variational autoencoder.</p>
            </fn>
            <fn id="table2fn10">
              <p><sup>j</sup>NFlow: normalizing flow.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <fig id="figure4" position="float">
          <label>Figure 4</label>
          <caption>
            <p>Mixed-effects model with the population as a random effect, complexity as a fixed effect, and hallucination rate (HR) as an outcome for the synthetic data generation (SDG) model sequential decision trees (STs). HR in synthetic datasets was determined as described and averaged across the 10 synthetic datasets per trained SDG model. Complexity for a dataset was calculated as the log sum of its variables’ cardinalities. The lines are the predicted HR by the mixed-effects model, while the points are the observed HR. BORN: Better Outcomes Registry &#38; Network; CCHS: Canadian Community Health Survey; FAERS: US Food and Drug Administration Adverse Event Reporting System; MIMIC-III: Medical Information Mart for Intensive Care III.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e77893_fig4.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
      </sec>
      <sec>
        <title>Downstream Prognostic AI and ML Modeling</title>
        <p>Once the occurrence of hallucinations in tabular synthetic health care data was confirmed, we analyzed the effect of HR on downstream utility. The downstream task was prognostic AI and ML modeling, and performance was measured by AUROC when LGBM and MLP models were trained on synthetic and tested on real data (ie, TSTR).</p>
        <p>In general, the median deviation of the AI and ML performance derived from the synthetic data (ie, TSTR) from the one derived from the real data (ie, train real test real) was low across all health care populations (<xref ref-type="table" rid="table3">Table 3</xref>). Notably, in the Nexoid population, most prognostic MLP models trained on synthetic data outperformed the model trained on real data (refer to <xref ref-type="table" rid="table3">Table 3</xref> and green vs dashed gray lines in <xref rid="figure5" ref-type="fig">Figure 5</xref>).</p>
        <p>Train real test real was calculated across 10 additional training-holdout splits to investigate sensitivity to the stochasticity of the data partitioning. The variation was very small for LGBM across all populations and also for MLP, except in the US Food and Drug Administration Adverse Event Reporting System (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). This indicates that performance was generally robust and insensitive to the particular data split used.</p>
        <table-wrap position="float" id="table3">
          <label>Table 3</label>
          <caption>
            <p>Downstream prognostic artificial intelligence and machine learning modeling performance<sup>a</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="100"/>
            <col width="150"/>
            <col width="150"/>
            <col width="150"/>
            <col width="0"/>
            <col width="150"/>
            <col width="150"/>
            <col width="150"/>
            <thead>
              <tr valign="top">
                <td>Population</td>
                <td colspan="4">LGBM<sup>b</sup></td>
                <td colspan="3">MLP<sup>c</sup></td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>TRTR<sup>d</sup>, median (IQR)</td>
                <td>TSTR<sup>e</sup>, median (IQR)</td>
                <td>TRTR-TSTR, median (IQR)</td>
                <td colspan="2">TRTR, median (IQR)</td>
                <td>TSTR, median (IQR)</td>
                <td>TRTR-TSTR, median (IQR)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td>BORN<sup>f</sup></td>
                <td>0.923 (0.922 to 0.924)</td>
                <td>0.906 (0.899 to 0.911)</td>
                <td>0.016 (0.011 to 0.025)</td>
                <td colspan="2">0.896 (0.896 to 0.896)</td>
                <td>0.864 (0.850 to 0.876)</td>
                <td>0.032 (0.020 to 0.046)</td>
              </tr>
              <tr valign="top">
                <td>California</td>
                <td>0.810 (0.809 to 0.812)</td>
                <td>0.666 (0.631 to 0.721)</td>
                <td>0.144 (0.089 to 0.176)</td>
                <td colspan="2">0.854 (0.854 to 0.854)</td>
                <td>0.824 (0.804 to 0.839)</td>
                <td>0.030 (0.015 to 0.050)</td>
              </tr>
              <tr valign="top">
                <td>CCHS<sup>g</sup></td>
                <td>0.708 (0.706 to 0.710)</td>
                <td>0.664 (0.639 to 0.682)</td>
                <td>0.043 (0.026 to 0.068)</td>
                <td colspan="2">0.698 (0.698 to 0.698)</td>
                <td>0.694 (0.688 to 0.698)</td>
                <td>0.004 (0.000 to 0.010)</td>
              </tr>
              <tr valign="top">
                <td>COVID-19</td>
                <td>0.957 (0.954 to 0.959)</td>
                <td>0.771 (0.609 to 0.917)</td>
                <td>0.187 (0.038 to 0.349)</td>
                <td colspan="2">0.931 (0.931 to 0.931)</td>
                <td>0.740 (0.661 to 0.829)</td>
                <td>0.192 (0.103 to 0.270)</td>
              </tr>
              <tr valign="top">
                <td>FAERS<sup>h</sup></td>
                <td>0.663 (0.652 to 0.675)</td>
                <td>0.557 (0.538 to 0.574)</td>
                <td>0.105 (0.086 to 0.127)</td>
                <td colspan="2">0.928 (0.928 to 0.928)</td>
                <td>0.818 (0.770 to 0.863)</td>
                <td>0.110 (0.064 to 0.157)</td>
              </tr>
              <tr valign="top">
                <td>Florida</td>
                <td>0.750 (0.748 to 0.751)</td>
                <td>0.622 (0.596 to 0.644)</td>
                <td>0.127 (0.106 to 0.154)</td>
                <td colspan="2">0.837 (0.837 to 0.837)</td>
                <td>0.811 (0.789 to 0.825)</td>
                <td>0.026 (0.011 to 0.048)</td>
              </tr>
              <tr valign="top">
                <td>MIMIC-III<sup>i</sup></td>
                <td>0.654 (0.653 to 0.658)</td>
                <td>0.561 (0.547 to 0.571)</td>
                <td>0.094 (0.080 to 0.108)</td>
                <td colspan="2">0.534 (0.534 to 0.534)</td>
                <td>0.527 (0.522 to 0.533)</td>
                <td>0.008 (0.002 to 0.013)</td>
              </tr>
              <tr valign="top">
                <td>New York</td>
                <td>0.806 (0.801 to 0.806)</td>
                <td>0.651 (0.626 to 0.686)</td>
                <td>0.153 (0.118 to 0.178)</td>
                <td colspan="2">0.859 (0.859 to 0.859)</td>
                <td>0.832 (0.811 to 0.848)</td>
                <td>0.027 (0.012 to 0.049)</td>
              </tr>
              <tr valign="top">
                <td>Nexoid</td>
                <td>0.730 (0.729 to 0.731)</td>
                <td>0.676 (0.662 to 0.702)</td>
                <td>0.054 (0.029 to 0.068)</td>
                <td colspan="2">0.681 (0.681 to 0.681)</td>
                <td>0.683 (0.671 to 0.692)</td>
                <td>–0.002 (–0.011 to 0.010)</td>
              </tr>
              <tr valign="top">
                <td>Texas</td>
                <td>0.810 (0.808 to 0.811)</td>
                <td>0.747 (0.720 to 0.762)</td>
                <td>0.062 (0.048 to 0.090)</td>
                <td colspan="2">0.813 (0.813 to 0.813)</td>
                <td>0.788 (0.778 to 0.800)</td>
                <td>0.025 (0.012 to 0.035)</td>
              </tr>
              <tr valign="top">
                <td>Washington</td>
                <td>0.784 (0.782 to 0.788)</td>
                <td>0.650 (0.617 to 0.679)</td>
                <td>0.135 (0.105 to 0.167)</td>
                <td colspan="2">0.870 (0.870 to 0.870)</td>
                <td>0.844 (0.831 to 0.852)</td>
                <td>0.026 (0.017 to 0.038)</td>
              </tr>
              <tr valign="top">
                <td>Washington 2008</td>
                <td>0.808 (0.806 to 0.810)</td>
                <td>0.684 (0.649 to 0.709)</td>
                <td>0.125 (0.100 to 0.160)</td>
                <td colspan="2">0.877 (0.877 to 0.877)</td>
                <td>0.843 (0.827 to 0.857)</td>
                <td>0.034 (0.019 to 0.050)</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table3fn1">
              <p><sup>a</sup>The different downstream tasks achieved varying performance in the real data (train real test real). The deviation of the performance derived from the synthetic data (train synthetic test real) is indicated as TRTR-TSTR. Performance was measured as the area under the receiver operating characteristic curve. The train synthetic test real is summarized across all synthetic data generation models.</p>
            </fn>
            <fn id="table3fn2">
              <p><sup>b</sup>LGBM: light gradient boosted decision tree.</p>
            </fn>
            <fn id="table3fn3">
              <p><sup>c</sup>MLP: multilayer perceptron.</p>
            </fn>
            <fn id="table3fn4">
              <p><sup>d</sup>TRTR: train real test real.</p>
            </fn>
            <fn id="table3fn5">
              <p><sup>e</sup>TSTR: train synthetic test real.</p>
            </fn>
            <fn id="table3fn6">
              <p><sup>f</sup>BORN: Better Outcomes Registry &#38; Network.</p>
            </fn>
            <fn id="table3fn7">
              <p><sup>g</sup>CCHS: Canadian Community Health Survey.</p>
            </fn>
            <fn id="table3fn8">
              <p><sup>h</sup>FAERS: US Food and Drug Administration Adverse Event Reporting System.</p>
            </fn>
            <fn id="table3fn9">
              <p><sup>i</sup>MIMIC-III: Medical Information Mart for Intensive Care III.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>To detect a trend in HR on TSTR, we focused on those populations where the HR differed across the variants by at least 0.25 between the 10th and 90th percentiles. While TSTR was computed for all synthetic datasets, this filtering step reduced the subset used for effect modeling to 19.71% (8766/44,478 trained SDG models).</p>
        <p>Among these, TSTR from LGBM was not affected by HR in most SDG models (6/7, 86%). Only the conditional generative adversarial network showed a significant decrease in prognostic LGBM modeling performance with increasing HR. The effect estimate was –0.0002 (95% CI –0.0003 to –0.0002) per percentage point in HR, which in the most extreme case (ie, 100% HR) would only result in a decrease in AUROC of 0.02. Similarly, the TSTR from MLP was not affected by HR in most models (5/7, 71%). In adversarial random forest and robust tabular variational autoencoder, there was a significant negative association with, again, very small effect estimates (OR –0.0001, 95% CI –0.0002 to –0.0001 and OR –0.0001, 95% CI –0.0001 to 0.0000, respectively). Consistent with these findings, the variance explained by the fixed effect was negligible across all SDG models, and in most models, the random effect explained most of the variance (<xref ref-type="table" rid="table4">Table 4</xref>).</p>
        <table-wrap position="float" id="table4">
          <label>Table 4</label>
          <caption>
            <p>Modeling the effect of hallucination rate (HR) on the downstream performance<sup>a</sup>.</p>
          </caption>
          <table width="1000" cellpadding="5" cellspacing="0" border="1" rules="groups" frame="hsides">
            <col width="30"/>
            <col width="340"/>
            <col width="300"/>
            <col width="100"/>
            <col width="130"/>
            <col width="100"/>
            <thead>
              <tr valign="top">
                <td colspan="2">SDG<sup>b</sup> model and AI<sup>c</sup> and ML<sup>d</sup> model</td>
                <td>Fixed effect HR, OR<sup>e</sup> (95% CI)</td>
                <td><italic>P</italic> value</td>
                <td><italic>R</italic><sup>2</sup> (fixed effect)</td>
                <td><italic>R</italic><sup>2</sup> (overall)</td>
              </tr>
            </thead>
            <tbody>
              <tr valign="top">
                <td colspan="6">
                  <bold>ST<sup>f</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LGBM<sup>g</sup></td>
                <td>0.0000 (–0.0001 to 0.0001)</td>
                <td>.70</td>
                <td>0.0000</td>
                <td>0.9962</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>MLP<sup>h</sup></td>
                <td>–0.0001 (–0.0001 to 0.0000)</td>
                <td>.18</td>
                <td>0.0003</td>
                <td>0.9911</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>BN<sup>i</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LGBM</td>
                <td>0.0000 (–0.0001 to 0.0002)</td>
                <td>.74</td>
                <td>0.0000</td>
                <td>0.9932</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>MLP</td>
                <td>0.0003 (0.0001 to 0.0005)</td>
                <td>.10</td>
                <td>0.0029</td>
                <td>0.9662</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>ARF<sup>j</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LGBM</td>
                <td>–0.0001 (–0.0002 to 0.0000)</td>
                <td>.15</td>
                <td>0.0003</td>
                <td>0.9985</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <italic>MLP<sup>k</sup></italic>
                </td>
                <td>–<italic>0.0001</italic> <italic>(</italic>–<italic>0.0002</italic> <italic>to</italic> –<italic>0.0001</italic><italic>)</italic></td>
                <td>
                  <italic>.002</italic>
                </td>
                <td>
                  <italic>0.0007</italic>
                </td>
                <td>
                  <italic>0.9918</italic>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>CTGAN<sup>l</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <italic>LGBM</italic>
                </td>
                <td>–<italic>0.0002</italic> <italic>(</italic>–<italic>0.0003</italic> <italic>to</italic> –<italic>0.0002</italic><italic>)</italic></td>
                <td>
                  <italic>&#60;.001</italic>
                </td>
                <td>
                  <italic>0.0007</italic>
                </td>
                <td>
                  <italic>0.9905</italic>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>MLP</td>
                <td>–0.0001 (–0.0003 to 0.0002)</td>
                <td>.70</td>
                <td>0.0001</td>
                <td>0.9866</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>TVAE<sup>m</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LGBM</td>
                <td>0.0004 (–0.0003 to 0.0011)</td>
                <td>.40</td>
                <td>0.0050</td>
                <td>0.9778</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>MLP</td>
                <td>0.0000 (–0.0001 to 0.0001)</td>
                <td>.80</td>
                <td>0.0000</td>
                <td>0.9784</td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>RTVAE<sup>n</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LGBM</td>
                <td>0.0000 (–0.0002 to 0.0001)</td>
                <td>.76</td>
                <td>0.0000</td>
                <td>0.9683</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>
                  <italic>MLP</italic>
                </td>
                <td>–<italic>0.0001 (</italic>–<italic>0.0001 to 0.0000)</italic></td>
                <td>
                  <italic>.007</italic>
                </td>
                <td>
                  <italic>0.0003</italic>
                </td>
                <td>
                  <italic>0.9730</italic>
                </td>
              </tr>
              <tr valign="top">
                <td colspan="6">
                  <bold>NFlow<sup>o</sup></bold>
                </td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>LGBM</td>
                <td>–0.0016 (–0.0038 to 0.0006)</td>
                <td>.14</td>
                <td>0.0675</td>
                <td>0.0675</td>
              </tr>
              <tr valign="top">
                <td>
                  <break/>
                </td>
                <td>MLP</td>
                <td>0.0004 (–0.0015 to 0.0022)</td>
                <td>.70</td>
                <td>0.0049</td>
                <td>0.0049</td>
              </tr>
            </tbody>
          </table>
          <table-wrap-foot>
            <fn id="table4fn1">
              <p><sup>a</sup>Linear mixed-effect models were fitted for each synthetic data generation model separately, with the following number of observations: 1962 for light gradient boosted decision tree and 1964 for multilayer perceptron for sequential decision trees; 1354 (light gradient boosted decision tree and multilayer perceptron) for Bayesian network; 1354 (light gradient boosted decision tree and multilayer perceptron) for adversarial random forest; 1354 (light gradient boosted decision tree and multilayer perceptron) for conditional generative adversarial network; 1352 (light gradient boosted decision tree) and 1354 (multilayer perceptron) for tabular variational autoencoder; 1349 (light gradient boosted decision tree) and 1354 (multilayer perceptron) for robust tabular variational autoencoder; and 32 (light gradient boosted decision tree and multilayer perceptron) for normal flow. Health care populations were considered as random effects, HR as fixed effects, and the train synthetic test real as an outcome. Both light gradient boosted decision tree and multilayer perceptron are considered. The coefficients for the HR in percentages are indicated. We provide the variance explained (ie, <italic>R</italic><sup>2</sup>) by the fixed effect only and by both fixed and random effects together (ie, <italic>R</italic><sup>2</sup> overall) for all models. For normal flow, there was no random effect since only one health care population met the requirements of HR range; therefore, <italic>R</italic><sup>2</sup> and <italic>R</italic><sup>2</sup> overall are identical.</p>
            </fn>
            <fn id="table4fn2">
              <p><sup>b</sup>SDG: synthetic data generation.</p>
            </fn>
            <fn id="table4fn3">
              <p><sup>c</sup>AI: artificial intelligence.</p>
            </fn>
            <fn id="table4fn4">
              <p><sup>d</sup>ML: machine learning.</p>
            </fn>
            <fn id="table4fn5">
              <p><sup>e</sup>OR: odds ratio.</p>
            </fn>
            <fn id="table4fn6">
              <p><sup>f</sup>ST: sequential decision tree.</p>
            </fn>
            <fn id="table4fn7">
              <p><sup>g</sup>LGBM: light gradient boosted decision tree.</p>
            </fn>
            <fn id="table4fn8">
              <p><sup>h</sup>MLP: multilayer perceptron.</p>
            </fn>
            <fn id="table4fn9">
              <p><sup>i</sup>BN: Bayesian network.</p>
            </fn>
            <fn id="table4fn10">
              <p><sup>j</sup>ARF: adversarial random forest.</p>
            </fn>
            <fn id="table4fn11">
              <p><sup>k</sup>Italicized text indicates models with <italic>P</italic>&#60;.05.</p>
            </fn>
            <fn id="table4fn12">
              <p><sup>l</sup>CTGAN: conditional generative adversarial network.</p>
            </fn>
            <fn id="table4fn13">
              <p><sup>m</sup>TVAE: tabular variational autoencoder.</p>
            </fn>
            <fn id="table4fn14">
              <p><sup>n</sup>RTVAE: robust tabular variational autoencoder.</p>
            </fn>
            <fn id="table4fn15">
              <p><sup>o</sup>NFlow: normal flow.</p>
            </fn>
          </table-wrap-foot>
        </table-wrap>
        <p>In <xref rid="figure5" ref-type="fig">Figure 5</xref>, the prognostic AI and ML model performance trend for the SDG model ST (the example shown in <xref rid="figure4" ref-type="fig">Figure 4</xref>) is illustrated across the different populations. For ST, only the Better Outcomes Registry &#38; Network, Nexoid, and Texas populations yielded synthetic variants with sufficient spread in the HR. Results for the LGBM and the MLP models are presented. As shown in <xref ref-type="table" rid="table4">Table 4</xref>, this effect was similar for the other SDG models.</p>
        <fig id="figure5" position="float">
          <label>Figure 5</label>
          <caption>
            <p>Mixed-effects model with health care population as a random effect, hallucination rate (HR) as a fixed effect, and train synthetic test real (TSTR) as an outcome for the synthetic data generation (SDG) model sequential decision trees (STs). HR in synthetic datasets was determined as described and averaged across the 10 synthetic datasets per trained SDG model. TSTR for a dataset was measured as the area under the receiver operating characteristic curve (AUROC) for light gradient boosted decision tree (LGBM) and multilayer perceptron (MLP) models. The green line is the predicted AUROC by the mixed-effects model, while the points are the observed AUROC. The dashed gray line is the AUROC obtained with train real test real (TRTR), that is, when the model was trained on real data.</p>
          </caption>
          <graphic xlink:href="jmir_v27i1e77893_fig5.png" alt-version="no" mimetype="image" position="float" xlink:type="simple"/>
        </fig>
        <p>Again, these analyses were repeated with fewer population variants (ie, a 50% and 25% subset). While the prognostic AI and ML model performance across all downstream tasks was similar to the ones shown in <xref ref-type="table" rid="table3">Table 3</xref>, the estimated effect for the HR on downstream performance had slight differences. More importantly, the smaller-scale evaluations reduced the number of populations with sufficient spread in HR for modeling, with a resulting sparse coverage in the random effect.</p>
      </sec>
    </sec>
    <sec sec-type="discussion">
      <title>Discussion</title>
      <sec>
        <title>Principal Findings</title>
        <p>In this study, we examined hallucinations in synthetic tabular health data. In total, 12 large datasets were used in a simulation of the relationship between dataset complexity and the HR, and between the HR and the downstream binary prediction performance of the generated datasets.</p>
        <p>Our findings suggest that hallucinations can be very common in synthetic tabular health data and, as hypothesized in the Introduction section, depend on the dataset’s complexity. However, evidence from this study did not support the second hypothesis that the greater the rate of hallucinations, the less effective the prognostic models would be. This means that prognostic AI and ML modeling was not negatively (or positively) affected by increasing hallucinations in most cases. In those cases where a negative trend was observed, the trend was negligibly small.</p>
      </sec>
      <sec>
        <title>Comparison to Prior Work</title>
        <p>To our knowledge, hallucinations in tabular synthetic data have not been systematically studied yet. While previous work on evaluating tabular synthetic data focused on utility, privacy, and fairness [<xref ref-type="bibr" rid="ref31">31</xref>,<xref ref-type="bibr" rid="ref32">32</xref>] without explicitly investigating hallucinations, this phenomenon has received considerable attention in generative text modeling. In this modality, hallucinations are typically seen as a major limitation [<xref ref-type="bibr" rid="ref13">13</xref>-<xref ref-type="bibr" rid="ref15">15</xref>].</p>
        <p>Intuitively, hallucinated tabular data can also pose limitations with the potential to degrade the performance of a prognostic AI and ML model because the model would learn patterns that are nonexistent in the population it is deployed on. However, our findings suggest that this is not the case.</p>
        <p>One potential explanation is that hallucinations may be mainly driven by statistically independent variables that are not associated with the outcome and thus less relevant for prognostic AI and ML modeling. If synthetic records have an invalid combination of values for such variables, they are hallucinated but can still preserve valid combinations of values for variables that are relevant to prognostic modeling. In addition, high-cardinality variables may have long-tailed distributions, meaning that some categories are very rare. Hallucinations that affect these rare categories would contribute little to the overall predictive performance: If the prediction algorithm does not learn these rare (hallucinated) values because they are in the long tail, then the impact on predictions on unseen data will be minimal. If it does memorize them, the impact will still be minimal as these specific values are unlikely to appear in unseen data.</p>
        <p>While hallucinations may not impact AI and ML modeling performance, their negative perception in previous work offers an important insight; they can still have a nontrivial impact on the trust in and acceptance of SDG by clinicians and researchers. In a sensitive sector such as health care, trust has been shown to be crucial for technology adoption [<xref ref-type="bibr" rid="ref129">129</xref>]. In the context of trust, hallucinated records that violate real-world constraints, such as female patients with prostate cancer or a young adult with a residency in a retirement home, seem more severe than hallucinated patients that do not exist in a certain population but are, in theory, plausible patients (eg, a male patient with breast cancer). Fidelity metrics based on marginal or multivariate distributions are not designed to detect such violations. This means, as part of a trust-building exercise, it would be very valid and important to check synthetic datasets for such obvious real-world constraints, although they do not necessarily impact prognostic AI and ML model performance.</p>
      </sec>
      <sec>
        <title>Strengths and Limitations</title>
        <p>This study explored hallucinations in synthetic health care data and their impact on prognostic AI and ML model performance. To our knowledge, this is the first study investigating hallucinations in tabular synthetic data in a large-scale methodological setup, including 6354 SDG training datasets derived from 12 real-world health care reference populations, 7 state-of-the-art SDG models, and 2 widely used prognostic AI and ML models as downstream tasks. Secondary analyses using only 50% and 25% of the population variants suggest that smaller-scale designs may be feasible when the population variants exhibit sufficient spread in the HR to detect meaningful trends.</p>
        <p>Nevertheless, there are some limitations to highlight.</p>
        <p>First, our definition of hallucinations provided one implementation of the concept of <italic>factuality</italic>. However, there may be other approaches for defining hallucinated records in synthetic data. For example, another option would be to search for violations of real-world constraints as mentioned previously (eg, prostate cancer in female patients), which could be described as hallucinations based on clinical plausibility. We decided not to rely on such a definition for two reasons: (1) the definition of real-world constraints requires a high degree of domain expertise specific to each dataset and (2) such implausible records would be a subset of nonexistent records. Our definition was consequently broader, capturing implausible records as well as other nonexistent records. Another definition could allow for or focus more on the distribution than on record-level similarity (ie, hallucination as distribution shift or based upon probabilistic similarity). Again, this is very likely a less sensitive definition in that it does not label nonexistent records as hallucinated, provided they match the underlying distribution. Hallucinations may also be defined in terms of statistical associations or patterns whereby a substantially different (ie, stronger or weaker) association can be considered a hallucination. In addition, such definitions typically require the specification of a threshold that would be hard to justify and further complicate interpretation.</p>
        <p>Second, the choice of discretization in the implementation of our definition of hallucinations is ultimately dataset dependent and was informed by domain knowledge. In health care data, divergences in categorical versus numerical variables carry fundamentally different interpretations that should be accounted for in a distance-based definition of hallucinations. However, it must be noted that the number of bins can change the identification of hallucinations, with more bins increasing sensitivity and fewer bins introducing more tolerance with the risk of underdetection. In addition, records with values at the boundary of the discretization bin could be misclassified as hallucinations. While this effect was low in our scenario, where datasets were primarily categorical and the number of datasets under investigation was large (<xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), such an implementation could inflate the HR.</p>
        <p>Third, any definition of hallucination based on violations of factuality, as the one in this study and those described previously, requires access to ground truth or population data. This dependency makes it difficult to evaluate the HR for a specific synthetic dataset, as the population data are often not readily available. However, if hallucinations are conceptualized as substantially different statistical associations, then the replicability of population parameters may offer an operationalizable way to quantify hallucinations and is, in fact, a utility metric that is used in certain synthetic data use cases [<xref ref-type="bibr" rid="ref130">130</xref>].</p>
        <p>Fourth, the population variants used in this study were predominantly of higher complexity, with relatively few examples of low-complexity data. Therefore, the findings may be more representative of scenarios involving high-cardinality or high-dimensional data. However, these datasets are commonly used in clinical research, supporting the relevance of our findings to many real-world health care research scenarios. In addition, while the sampling of population variants was necessary to manage the large combinatory space, the sampled variants may not be representative of the entire combinatory space.</p>
        <p>Fifth, the downstream task under investigation was prognostic AI and ML modeling measured as AUROC. We applied 5-fold cross-validation to set hyperparameters for LGBM but did not perform exhaustive hyperparameter tuning for MLP. The default MLP settings already resulted in a performance that was comparable to and sometimes even outperformed LGBM, so that a different setup of hyperparameters was unlikely to meaningfully improve performance, and we refrained from hyperparameter tuning for MLP. However, it may be valuable in other datasets.</p>
        <p>Finally, we were interested in prognostic AI and ML modeling. However, SDG has also been proposed as a privacy-enhancing technology in the context of clinical trials [<xref ref-type="bibr" rid="ref131">131</xref>]. Such a use case may be more sensitive to hallucinations if, for example, an external control arm is propensity score matched against the intervention arm. In contrast, descriptive statistics, particularly marginal distributions, are very likely not affected by hallucinations. Ultimately, however, it remains unclear at this stage which downstream tasks are most sensitive to hallucinated records, and their impact on specific use cases is speculative. Further systematic research is needed to identify which types of analyses are most vulnerable to hallucinations in synthetic tabular data.</p>
      </sec>
    </sec>
  </body>
  <back>
    <app-group>
      <supplementary-material id="app1">
        <label>Multimedia Appendix 1</label>
        <p>Data descriptions, supplemental methods, and supplemental results.</p>
        <media xlink:href="jmir_v27i1e77893_app1.pdf" xlink:title="PDF File  (Adobe PDF File), 902 KB"/>
      </supplementary-material>
    </app-group>
    <glossary>
      <title>Abbreviations</title>
      <def-list>
        <def-item>
          <term id="abb1">AI</term>
          <def>
            <p>artificial intelligence</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb2">AUROC</term>
          <def>
            <p>area under the receiver operating characteristic curve</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb3">HR</term>
          <def>
            <p>hallucination rate</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb4">LGBM</term>
          <def>
            <p>light gradient boosted decision tree</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb5">LLM</term>
          <def>
            <p>large language model</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb6">ML</term>
          <def>
            <p>machine learning</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb7">MLP</term>
          <def>
            <p>multilayer perceptron</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb8">OR</term>
          <def>
            <p>odds ratio</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb9">REB</term>
          <def>
            <p>Research Ethics Board</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb10">SDG</term>
          <def>
            <p>synthetic data generation</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb11">ST</term>
          <def>
            <p>sequential decision tree</p>
          </def>
        </def-item>
        <def-item>
          <term id="abb12">TSTR</term>
          <def>
            <p>train synthetic test real</p>
          </def>
        </def-item>
      </def-list>
    </glossary>
    <ack>
      <p>LP is funded by the Deutsche Forschungsgemeinschaft (German Research Foundation, #530282197). KEE is funded by the Canada Research Chairs Program through the Canadian Institutes of Health Research and a Discovery Grant (RGPIN-2022-04811) from the Natural Sciences and Engineering Research Council of Canada. DL is funded by the Canadian Children Inflammatory Bowel Disease Network.</p>
    </ack>
    <notes>
      <sec>
        <title>Data Availability</title>
        <p>Some datasets analyzed during this study are publicly available; some datasets analyzed during this study are not publicly available due to privacy. Details about data availability are provided in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> for each population. All original code for our analysis has been deposited in the Open Science Framework [<xref ref-type="bibr" rid="ref132">132</xref>].</p>
      </sec>
    </notes>
    <fn-group>
      <fn fn-type="con">
        <p>LP and KEE were involved in conceptualization, design, analysis, and drafting the manuscript. SEK and DL were involved in the provision of relevant synthetic data generation software and datasets. LP, KEE, DL, and SEK were involved in reviewing and editing the manuscript.</p>
      </fn>
      <fn fn-type="conflict">
        <p>At the time the study was conducted, KEE was the scholar in residence at the office of the Information and Privacy Commissioner of Ontario and held shares in Aetion, which provided the sequential synthesis generative model software that was used in this study. At the time of publication, both of these conflicts are no longer in effect. At the time of publication, KEE is the Editor-in-Chief of <italic>JMIR AI</italic>.</p>
      </fn>
    </fn-group>
    <ref-list>
      <ref id="ref1">
        <label>1</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Baker</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Kanade</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Hallucinating faces</article-title>
          <source>Proceedings 4th IEEE International Conference on Automatic Face and Gesture Recognition</source>
          <year>2000</year>
          <conf-name>AFGR '00</conf-name>
          <conf-date>March 28-30, 2000</conf-date>
          <conf-loc>Grenoble, France</conf-loc>
          <fpage>83</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/840616"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/afgr.2000.840616</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref2">
        <label>2</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Cheng</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Super-identity convolutional neural network for face hallucination</article-title>
          <source>Proceedings of the 15th European Conference on Computer Vision</source>
          <year>2018</year>
          <conf-name>ECCV '18</conf-name>
          <conf-date>September 8-14, 2018</conf-date>
          <conf-loc>Munich, Germany</conf-loc>
          <fpage>196</fpage>
          <lpage>211</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/chapter/10.1007/978-3-030-01252-6_12"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/978-3-030-01252-6_12</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref3">
        <label>3</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chi</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Wu</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Generative facial prior embedded degradation adaption network for heterogeneous face hallucination</article-title>
          <source>Multimed Tools Appl</source>
          <year>2023</year>
          <month>10</month>
          <day>17</day>
          <volume>83</volume>
          <issue>15</issue>
          <fpage>43955</fpage>
          <lpage>81</lpage>
          <pub-id pub-id-type="doi">10.1007/S11042-023-16932-3</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref4">
        <label>4</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Wavelet domain generative adversarial network for multi-scale face hallucination</article-title>
          <source>Int J Comput Vis</source>
          <year>2019</year>
          <month>02</month>
          <day>12</day>
          <volume>127</volume>
          <issue>6-7</issue>
          <fpage>763</fpage>
          <lpage>84</lpage>
          <pub-id pub-id-type="doi">10.1007/S11263-019-01154-8</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref5">
        <label>5</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Tsang</surname>
              <given-names>IW</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Face hallucination with finishing touches</article-title>
          <source>IEEE Trans Image Process</source>
          <year>2021</year>
          <volume>30</volume>
          <fpage>1728</fpage>
          <lpage>43</lpage>
          <pub-id pub-id-type="doi">10.1109/tip.2020.3046918</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref6">
        <label>6</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Marnerides</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Bashford-Rogers</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Debattista</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Deep HDR hallucination for inverse tone mapping</article-title>
          <source>Sensors (Basel)</source>
          <year>2021</year>
          <month>06</month>
          <day>11</day>
          <volume>21</volume>
          <issue>12</issue>
          <fpage>4032</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=s21124032"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/s21124032</pub-id>
          <pub-id pub-id-type="medline">34208062</pub-id>
          <pub-id pub-id-type="pii">s21124032</pub-id>
          <pub-id pub-id-type="pmcid">PMC8230591</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref7">
        <label>7</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Deep learning face hallucination via attributes transfer and enhancement</article-title>
          <source>Proceedings of the 2019 IEEE International Conference on Multimedia and Expo</source>
          <year>2019</year>
          <conf-name>ICME '19</conf-name>
          <conf-date>July 8-12, 2019</conf-date>
          <conf-loc>Shanghai, China</conf-loc>
          <fpage>604</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/8785029"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref8">
        <label>8</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Pro-UIGAN: progressive face hallucination from occluded thumbnails</article-title>
          <source>IEEE Trans Image Process</source>
          <year>2022</year>
          <volume>31</volume>
          <fpage>3236</fpage>
          <lpage>50</lpage>
          <pub-id pub-id-type="doi">10.1109/tip.2022.3167280</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref9">
        <label>9</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shao</surname>
              <given-names>WZ</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ge</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>On potentials of regularized Wasserstein generative adversarial networks for realistic hallucination of tiny faces</article-title>
          <source>Neurocomputing</source>
          <year>2019</year>
          <month>10</month>
          <volume>364</volume>
          <fpage>1</fpage>
          <lpage>15</lpage>
          <pub-id pub-id-type="doi">10.1016/j.neucom.2019.07.046</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref10">
        <label>10</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shao</surname>
              <given-names>WZ</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>JJ</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Ge</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>LQ</given-names>
            </name>
            <name name-style="western">
              <surname>Bao</surname>
              <given-names>BK</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>HB</given-names>
            </name>
          </person-group>
          <article-title>Tiny face hallucination via boundary equilibrium generative adversarial networks</article-title>
          <source>Proceedings of the 10th International Conference on Graphics and Image Processing</source>
          <year>2019</year>
          <conf-name>ICGIP 2018</conf-name>
          <conf-date>December 12-14, 2018</conf-date>
          <conf-loc>Chengdu, China</conf-loc>
          <fpage>110693M</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.spiedigitallibrary.org/conference-proceedings-of-spie/11069/2524361/Tiny-face-hallucination-via-boundary-equilibrium-generative-adversarial-networks/10.1117/12.2524361.short"/>
          </comment>
          <pub-id pub-id-type="doi">10.1117/12.2524361</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref11">
        <label>11</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shao</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Tiny face hallucination via relativistic adversarial learning</article-title>
          <source>J Electron Inf Technol</source>
          <year>2021</year>
          <fpage>2577</fpage>
          <lpage>85</lpage>
          <pub-id pub-id-type="doi">10.11999/JEIT200362</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref12">
        <label>12</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ji</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Frieske</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Su</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Ishii</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bang</surname>
              <given-names>YJ</given-names>
            </name>
            <name name-style="western">
              <surname>Madotto</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Fung</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Survey of hallucination in natural language generation</article-title>
          <source>ACM Comput Surv</source>
          <year>2023</year>
          <month>03</month>
          <day>03</day>
          <volume>55</volume>
          <issue>12</issue>
          <fpage>1</fpage>
          <lpage>38</lpage>
          <pub-id pub-id-type="doi">10.1145/3571730</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref13">
        <label>13</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Asgari</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Montaña-Brown</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Dubois</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Khalil</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Balloch</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yeung</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>Pimenta</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>A framework to assess clinical safety and hallucination rates of LLMs for medical text summarisation</article-title>
          <source>NPJ Digit Med</source>
          <year>2025</year>
          <month>05</month>
          <day>13</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>274</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-025-01670-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-025-01670-7</pub-id>
          <pub-id pub-id-type="medline">40360677</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-025-01670-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC12075489</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref14">
        <label>14</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maddox</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Babski</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Embi</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Gerhart</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Goldsack</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Parikh</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Sarich</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <source>Generative Artificial Intelligence in Health and Medicine: Opportunities and Responsibilities for Transformative Innovation</source>
          <year>2025</year>
          <publisher-loc>Washington, DC</publisher-loc>
          <publisher-name>National Academies Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref15">
        <label>15</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vrdoljak</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Boban</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Vilović</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kumrić</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Božić</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>A review of large language models in medical education, clinical decision support, and healthcare administration</article-title>
          <source>Healthcare (Basel)</source>
          <year>2025</year>
          <month>03</month>
          <day>10</day>
          <volume>13</volume>
          <issue>6</issue>
          <fpage>603</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=healthcare13060603"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/healthcare13060603</pub-id>
          <pub-id pub-id-type="medline">40150453</pub-id>
          <pub-id pub-id-type="pii">healthcare13060603</pub-id>
          <pub-id pub-id-type="pmcid">PMC11942098</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref16">
        <label>16</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Goldberg</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Kohane</surname>
              <given-names>I</given-names>
            </name>
          </person-group>
          <source>The AI revolution in medicine: GPT-4 and beyond</source>
          <year>2023</year>
          <publisher-loc>New York, NY</publisher-loc>
          <publisher-name>Pearson Education</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref17">
        <label>17</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bent</surname>
              <given-names>AA</given-names>
            </name>
          </person-group>
          <article-title>Large language models: AI's legal revolution</article-title>
          <source>Pace L Rev</source>
          <year>2023</year>
          <month>12</month>
          <day>20</day>
          <volume>44</volume>
          <issue>1</issue>
          <fpage>91</fpage>
          <pub-id pub-id-type="doi">10.58948/2331-3528.2083</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref18">
        <label>18</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Tan</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Westermann</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Benyekhlef</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT as an artificial lawyer?</article-title>
          <source>Proceedings of the 2023 International Conference and Workshop on Artificial Intelligence</source>
          <year>2023</year>
          <conf-name>JURIX '23</conf-name>
          <conf-date>June 19, 2023</conf-date>
          <conf-loc>Braga, Portugal</conf-loc>
          <fpage>25</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ceur-ws.org/Vol-3435/short2.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref19">
        <label>19</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Alkaissi</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>McFarlane</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Artificial hallucinations in ChatGPT: implications in scientific writing</article-title>
          <source>Cureus</source>
          <year>2023</year>
          <month>02</month>
          <volume>15</volume>
          <issue>2</issue>
          <fpage>e35179</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36811129"/>
          </comment>
          <pub-id pub-id-type="doi">10.7759/cureus.35179</pub-id>
          <pub-id pub-id-type="medline">36811129</pub-id>
          <pub-id pub-id-type="pmcid">PMC9939079</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref20">
        <label>20</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Athaluri</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Manthena</surname>
              <given-names>SV</given-names>
            </name>
            <name name-style="western">
              <surname>Kesapragada</surname>
              <given-names>VS</given-names>
            </name>
            <name name-style="western">
              <surname>Yarlagadda</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Dave</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Duddumpudi</surname>
              <given-names>RT</given-names>
            </name>
          </person-group>
          <article-title>Exploring the boundaries of reality: investigating the phenomenon of artificial intelligence hallucination in scientific writing through ChatGPT references</article-title>
          <source>Cureus</source>
          <year>2023</year>
          <month>04</month>
          <volume>15</volume>
          <issue>4</issue>
          <fpage>e37432</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37182055"/>
          </comment>
          <pub-id pub-id-type="doi">10.7759/cureus.37432</pub-id>
          <pub-id pub-id-type="medline">37182055</pub-id>
          <pub-id pub-id-type="pmcid">PMC10173677</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref21">
        <label>21</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sharun</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Banu</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Pawde</surname>
              <given-names>AM</given-names>
            </name>
            <name name-style="western">
              <surname>Kumar</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Akash</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Dhama</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Pal</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>ChatGPT and artificial hallucinations in stem cell research: assessing the accuracy of generated references - a preliminary study</article-title>
          <source>Ann Med Surg (Lond)</source>
          <year>2023</year>
          <month>10</month>
          <volume>85</volume>
          <issue>10</issue>
          <fpage>5275</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37811040"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/MS9.0000000000001228</pub-id>
          <pub-id pub-id-type="medline">37811040</pub-id>
          <pub-id pub-id-type="pii">AMSU-D-23-01385</pub-id>
          <pub-id pub-id-type="pmcid">PMC10553015</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref22">
        <label>22</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Proctor</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>B.C. lawyer reprimanded for citing fake cases invented by ChatGPT</article-title>
          <source>CBC News</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.cbc.ca/news/canada/british-columbia/lawyer-chatgpt-fake-precedent-1.7126393">https://www.cbc.ca/news/canada/british-columbia/lawyer-chatgpt-fake-precedent-1.7126393</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref23">
        <label>23</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Geroimenko</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Geroimenko</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Generative AI hallucinations in healthcare: a challenge for prompt engineering and creativity</article-title>
          <source>Human-Computer Creativity: Generative AI in Education, Art, and Healthcare</source>
          <year>2025</year>
          <publisher-loc>Cham, Switzerland</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>321</fpage>
          <lpage>35</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref24">
        <label>24</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vishwanath</surname>
              <given-names>PR</given-names>
            </name>
            <name name-style="western">
              <surname>Tiwari</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Naik</surname>
              <given-names>TG</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Thai</surname>
              <given-names>DN</given-names>
            </name>
          </person-group>
          <article-title>Faithfulness hallucination detection in healthcare AI</article-title>
          <source>OpenReview</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://openreview.net/forum?id=6eMIzKFOpJ">https://openreview.net/forum?id=6eMIzKFOpJ</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref25">
        <label>25</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kim</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jeong</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>SS</given-names>
            </name>
            <name name-style="western">
              <surname>Lu</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Alhamoud</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mun</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Grau</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Medical hallucinations in foundation models and their impact on healthcare</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on February 26, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2503.05777"/>
          </comment>
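          <!-- NOTE(review): this DOI uses the 10.1101 prefix (the one seen in medRxiv content URLs), whereas the source and link of this citation point to arXiv:2503.05777; confirm which preprint record is intended. -->
          <pub-id pub-id-type="doi">10.1101/2025.02.28.25323115</pub-id>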
        </nlm-citation>
      </ref>
      <ref id="ref26">
        <label>26</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Walonoski</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kramer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Nichols</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Quina</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Moesel</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Hall</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Duffett</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Dube</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Gallagher</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>McLachlan</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Synthea: an approach, method, and software mechanism for generating synthetic patients and the synthetic electronic health care record</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2018</year>
          <month>03</month>
          <day>01</day>
          <volume>25</volume>
          <issue>3</issue>
          <fpage>230</fpage>
          <lpage>8</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/29025144"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocx079</pub-id>
          <pub-id pub-id-type="medline">29025144</pub-id>
          <pub-id pub-id-type="pii">4098271</pub-id>
          <pub-id pub-id-type="pmcid">PMC7651916</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref27">
        <label>27</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Jeanson</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Farkouh</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Godoy</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Minha</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Tzuman</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Marcus</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Medical calculators derived synthetic cohorts: a novel method for generating synthetic patient data</article-title>
          <source>Sci Rep</source>
          <year>2024</year>
          <month>05</month>
          <day>20</day>
          <volume>14</volume>
          <issue>1</issue>
          <fpage>11437</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-024-61721-z"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-024-61721-z</pub-id>
          <pub-id pub-id-type="medline">38763934</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-024-61721-z</pub-id>
          <pub-id pub-id-type="pmcid">PMC11102910</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref28">
        <label>28</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Al-Dhamari</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Abu Attieh</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Prasser</surname>
              <given-names>F</given-names>
            </name>
          </person-group>
          <article-title>Synthetic datasets for open software development in rare disease research</article-title>
          <source>Orphanet J Rare Dis</source>
          <year>2024</year>
          <month>07</month>
          <day>15</day>
          <volume>19</volume>
          <issue>1</issue>
          <fpage>265</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ojrd.biomedcentral.com/articles/10.1186/s13023-024-03254-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s13023-024-03254-2</pub-id>
          <pub-id pub-id-type="medline">39010138</pub-id>
          <pub-id pub-id-type="pii">10.1186/s13023-024-03254-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC11247768</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref29">
        <label>29</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Templ</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Meindl</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Kowarik</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dupriez</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <article-title>Simulation of synthetic complex data: the R package simPop</article-title>
          <source>J Stat Soft</source>
          <year>2017</year>
          <volume>79</volume>
          <issue>10</issue>
          <fpage>1</fpage>
          <lpage>38</lpage>
          <pub-id pub-id-type="doi">10.18637/jss.v079.i10</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref30">
        <label>30</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rineer</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kruskamp</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Kery</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Hilscher</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Bobashev</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>A national synthetic populations dataset for the United States</article-title>
          <source>Sci Data</source>
          <year>2025</year>
          <month>01</month>
          <day>25</day>
          <volume>12</volume>
          <issue>1</issue>
          <fpage>144</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41597-025-04380-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41597-025-04380-7</pub-id>
          <pub-id pub-id-type="medline">39863626</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41597-025-04380-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC11762717</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref31">
        <label>31</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kaabachi</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Despraz</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Meurers</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Otte</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Halilovic</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Kulynych</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Prasser</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Raisaro</surname>
              <given-names>JL</given-names>
            </name>
          </person-group>
          <article-title>A scoping review of privacy and utility metrics in medical synthetic data</article-title>
          <source>NPJ Digit Med</source>
          <year>2025</year>
          <month>01</month>
          <day>27</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>60</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-024-01359-3"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-024-01359-3</pub-id>
          <pub-id pub-id-type="medline">39870798</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41746-024-01359-3</pub-id>
          <pub-id pub-id-type="pmcid">PMC11772694</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref32">
        <label>32</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Vallevik</surname>
              <given-names>VB</given-names>
            </name>
            <name name-style="western">
              <surname>Babic</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Marshall</surname>
              <given-names>SE</given-names>
            </name>
            <name name-style="western">
              <surname>Elvatun</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Brøgger</surname>
              <given-names>HM</given-names>
            </name>
            <name name-style="western">
              <surname>Alagaratnam</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Edwin</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Veeraragavan</surname>
              <given-names>NR</given-names>
            </name>
            <name name-style="western">
              <surname>Befring</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Nygård</surname>
              <given-names>JF</given-names>
            </name>
          </person-group>
          <article-title>Can I trust my fake data - a comprehensive quality assessment framework for synthetic tabular data in healthcare</article-title>
          <source>Int J Med Inform</source>
          <year>2024</year>
          <month>05</month>
          <volume>185</volume>
          <fpage>105413</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S1386-5056(24)00076-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.ijmedinf.2024.105413</pub-id>
          <pub-id pub-id-type="medline">38493547</pub-id>
          <pub-id pub-id-type="pii">S1386-5056(24)00076-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref33">
        <label>33</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Seven ways to evaluate the utility of synthetic data</article-title>
          <source>IEEE Secur Privacy</source>
          <year>2020</year>
          <month>07</month>
          <volume>18</volume>
          <issue>4</issue>
          <fpage>56</fpage>
          <lpage>9</lpage>
          <pub-id pub-id-type="doi">10.1109/msec.2020.2992821</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref34">
        <label>34</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mosquera</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>El-Hussuna</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Utility metrics for evaluating synthetic health data generation methods: validation study</article-title>
          <source>JMIR Med Inform</source>
          <year>2022</year>
          <month>04</month>
          <day>07</day>
          <volume>10</volume>
          <issue>4</issue>
          <fpage>e35734</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://medinform.jmir.org/2022/4/e35734/"/>
          </comment>
          <pub-id pub-id-type="doi">10.2196/35734</pub-id>
          <pub-id pub-id-type="medline">35389366</pub-id>
          <pub-id pub-id-type="pii">v10i4e35734</pub-id>
          <pub-id pub-id-type="pmcid">PMC9030990</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref35">
        <label>35</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kaabachi</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Despraz</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Meurers</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>Can we trust synthetic data in medicine? A scoping review of privacy and utility metrics</article-title>
          <source>medRxiv</source>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.medrxiv.org/content/10.1101/2023.11.28.23299124v1"/>
          </comment>
          <comment>Preprint posted online on November 28, 2023</comment>
        </nlm-citation>
      </ref>
      <ref id="ref36">
        <label>36</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maynez</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Narayan</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Bohnet</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>McDonald</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>On faithfulness and factuality in abstractive summarization</article-title>
          <source>Proceedings of the 58th Annual Meeting of the Association for Computational Linguistics</source>
          <year>2020</year>
          <conf-name>ACL '20</conf-name>
          <conf-date>July 5-10, 2020</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <fpage>1906</fpage>
          <lpage>19</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://aclanthology.org/2020.acl-main.173.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.18653/v1/2020.acl-main.173</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref37">
        <label>37</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>A mathematical investigation of hallucination and creativity in GPT models</article-title>
          <source>Mathematics</source>
          <year>2023</year>
          <month>05</month>
          <day>16</day>
          <volume>11</volume>
          <issue>10</issue>
          <fpage>2320</fpage>
          <pub-id pub-id-type="doi">10.3390/math11102320</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref38">
        <label>38</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>PH</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>How to develop machine learning models for healthcare</article-title>
          <source>Nat Mater</source>
          <year>2019</year>
          <month>05</month>
          <day>18</day>
          <volume>18</volume>
          <issue>5</issue>
          <fpage>410</fpage>
          <lpage>4</lpage>
          <pub-id pub-id-type="doi">10.1038/s41563-019-0345-0</pub-id>
          <pub-id pub-id-type="medline">31000806</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41563-019-0345-0</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref39">
        <label>39</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>An</surname>
              <given-names>Q</given-names>
            </name>
            <name name-style="western">
              <surname>Rahman</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Zhou</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kang</surname>
              <given-names>JJ</given-names>
            </name>
          </person-group>
          <article-title>A comprehensive review on machine learning in healthcare industry: classification, restrictions, opportunities and challenges</article-title>
          <source>Sensors (Basel)</source>
          <year>2023</year>
          <month>04</month>
          <day>22</day>
          <volume>23</volume>
          <issue>9</issue>
          <fpage>4178</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.mdpi.com/resolver?pii=s23094178"/>
          </comment>
          <pub-id pub-id-type="doi">10.3390/s23094178</pub-id>
          <pub-id pub-id-type="medline">37177382</pub-id>
          <pub-id pub-id-type="pii">s23094178</pub-id>
          <pub-id pub-id-type="pmcid">PMC10180678</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref40">
        <label>40</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kadra</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Lindauer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Hutter</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Grabocka</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Well-tuned simple nets excel on tabular datasets</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on June 21, 2021</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2106.11189"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref41">
        <label>41</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Valero De Bernabé</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Soriano</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Albaladejo</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Juarranz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Calle</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Martínez</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Domínguez-Rojas</surname>
              <given-names>V</given-names>
            </name>
          </person-group>
          <article-title>Risk factors for low birth weight: a review</article-title>
          <source>Eur J Obstet Gynecol Reprod Biol</source>
          <year>2004</year>
          <month>09</month>
          <day>10</day>
          <volume>116</volume>
          <issue>1</issue>
          <fpage>3</fpage>
          <lpage>15</lpage>
          <pub-id pub-id-type="doi">10.1016/j.ejogrb.2004.03.007</pub-id>
          <pub-id pub-id-type="medline">15294360</pub-id>
          <pub-id pub-id-type="pii">S0301211504001654</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref42">
        <label>42</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Yadav</surname>
              <given-names>DK</given-names>
            </name>
            <name name-style="western">
              <surname>Chaudhary</surname>
              <given-names>U</given-names>
            </name>
            <name name-style="western">
              <surname>Shrestha</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Risk factors associated with low birth weight</article-title>
          <source>J Nepal Health Res Counc</source>
          <year>2011</year>
          <month>10</month>
          <volume>9</volume>
          <issue>2</issue>
          <fpage>159</fpage>
          <lpage>64</lpage>
          <pub-id pub-id-type="medline">22929846</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref43">
        <label>43</label>
        <nlm-citation citation-type="web">
          <article-title>HCUP State Inpatient Databases (SID). Healthcare Cost and Utilization Project (HCUP)</article-title>
          <source>Agency for Healthcare Research and Quality</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://hcup-us.ahrq.gov/sidoverview.jsp">https://hcup-us.ahrq.gov/sidoverview.jsp</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref44">
        <label>44</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>França</surname>
              <given-names>UL</given-names>
            </name>
            <name name-style="western">
              <surname>McManus</surname>
              <given-names>ML</given-names>
            </name>
          </person-group>
          <article-title>Frequency, trends, and antecedents of severe maternal depression after three million U.S. births</article-title>
          <source>PLoS One</source>
          <year>2018</year>
          <volume>13</volume>
          <issue>2</issue>
          <fpage>e0192854</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0192854"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0192854</pub-id>
          <pub-id pub-id-type="medline">29444165</pub-id>
          <pub-id pub-id-type="pii">PONE-D-17-15893</pub-id>
          <pub-id pub-id-type="pmcid">PMC5812647</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref45">
        <label>45</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Brownlee</surname>
              <given-names>SA</given-names>
            </name>
            <name name-style="western">
              <surname>Blackwell</surname>
              <given-names>RH</given-names>
            </name>
            <name name-style="western">
              <surname>Blanco</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Zapf</surname>
              <given-names>MA</given-names>
            </name>
            <name name-style="western">
              <surname>Kliethermes</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>GN</given-names>
            </name>
            <name name-style="western">
              <surname>Kuo</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>Kothari</surname>
              <given-names>AN</given-names>
            </name>
          </person-group>
          <article-title>Impact of post-hospital syndrome on outcomes following elective, ambulatory surgery</article-title>
          <source>Ann Surg</source>
          <year>2017</year>
          <month>08</month>
          <volume>266</volume>
          <issue>2</issue>
          <fpage>274</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/27537532"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/SLA.0000000000001965</pub-id>
          <pub-id pub-id-type="medline">27537532</pub-id>
          <pub-id pub-id-type="pmcid">PMC5315678</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref46">
        <label>46</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Maclagan</surname>
              <given-names>LC</given-names>
            </name>
            <name name-style="western">
              <surname>Park</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Sanmartin</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Mathur</surname>
              <given-names>KR</given-names>
            </name>
            <name name-style="western">
              <surname>Roth</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Manuel</surname>
              <given-names>DG</given-names>
            </name>
            <name name-style="western">
              <surname>Gershon</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Booth</surname>
              <given-names>GL</given-names>
            </name>
            <name name-style="western">
              <surname>Bhatia</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Atzema</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Tu</surname>
              <given-names>JV</given-names>
            </name>
          </person-group>
          <article-title>The CANHEART health index: a tool for monitoring the cardiovascular health of the Canadian population</article-title>
          <source>CMAJ</source>
          <year>2014</year>
          <month>02</month>
          <day>18</day>
          <volume>186</volume>
          <issue>3</issue>
          <fpage>180</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="http://www.cmaj.ca/cgi/pmidlookup?view=long&amp;pmid=24366893"/>
          </comment>
          <pub-id pub-id-type="doi">10.1503/cmaj.131358</pub-id>
          <pub-id pub-id-type="medline">24366893</pub-id>
          <pub-id pub-id-type="pii">cmaj.131358</pub-id>
          <pub-id pub-id-type="pmcid">PMC3928209</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref47">
        <label>47</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Berry</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>O'Neill</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Sturrock</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Acharya</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Brankston</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Harish</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Kornas</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Maani</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Naganathan</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Obress</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Rossi</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Simmons</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Van Camp</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Xie</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Tuite</surname>
              <given-names>AR</given-names>
            </name>
            <name name-style="western">
              <surname>Greer</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Fisman</surname>
              <given-names>DN</given-names>
            </name>
            <name name-style="western">
              <surname>Soucy</surname>
              <given-names>JR</given-names>
            </name>
          </person-group>
          <article-title>A sub-national real-time epidemiological and vaccination database for the COVID-19 pandemic in Canada</article-title>
          <source>Sci Data</source>
          <year>2021</year>
          <month>07</month>
          <day>15</day>
          <volume>8</volume>
          <issue>1</issue>
          <fpage>173</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41597-021-00955-2"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41597-021-00955-2</pub-id>
          <pub-id pub-id-type="medline">34267221</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41597-021-00955-2</pub-id>
          <pub-id pub-id-type="pmcid">PMC8282612</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref48">
        <label>48</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Marwitz</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Jones</surname>
              <given-names>SC</given-names>
            </name>
            <name name-style="western">
              <surname>Kortepeter</surname>
              <given-names>CM</given-names>
            </name>
            <name name-style="western">
              <surname>Dal Pan</surname>
              <given-names>GJ</given-names>
            </name>
            <name name-style="western">
              <surname>Muñoz</surname>
              <given-names>MA</given-names>
            </name>
          </person-group>
          <article-title>An evaluation of postmarketing reports with an outcome of death in the US FDA adverse event reporting system</article-title>
          <source>Drug Saf</source>
          <year>2020</year>
          <month>05</month>
          <volume>43</volume>
          <issue>5</issue>
          <fpage>457</fpage>
          <lpage>65</lpage>
          <pub-id pub-id-type="doi">10.1007/s40264-020-00908-5</pub-id>
          <pub-id pub-id-type="medline">31981082</pub-id>
          <pub-id pub-id-type="pii">10.1007/s40264-020-00908-5</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref49">
        <label>49</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Meddings</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Reichert</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Smith</surname>
              <given-names>SN</given-names>
            </name>
            <name name-style="western">
              <surname>Iwashyna</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Langa</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Hofer</surname>
              <given-names>TP</given-names>
            </name>
            <name name-style="western">
              <surname>McMahon</surname>
              <given-names>LF</given-names>
            </name>
          </person-group>
          <article-title>The impact of disability and social determinants of health on condition-specific readmissions beyond Medicare risk adjustments: a cohort study</article-title>
          <source>J Gen Intern Med</source>
          <year>2017</year>
          <month>01</month>
          <volume>32</volume>
          <issue>1</issue>
          <fpage>71</fpage>
          <lpage>80</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/27848189"/>
          </comment>
          <pub-id pub-id-type="doi">10.1007/s11606-016-3869-x</pub-id>
          <pub-id pub-id-type="medline">27848189</pub-id>
          <pub-id pub-id-type="pii">10.1007/s11606-016-3869-x</pub-id>
          <pub-id pub-id-type="pmcid">PMC5215164</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref50">
        <label>50</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III clinical database CareVue subset (version 1.4)</article-title>
          <source>PhysioNet</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://physionet.org/content/mimic3-carevue/1.4/">https://physionet.org/content/mimic3-carevue/1.4/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref51">
        <label>51</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>AE</given-names>
            </name>
            <name name-style="western">
              <surname>Pollard</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Shen</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Lehman</surname>
              <given-names>LW</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ghassemi</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Szolovits</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Celi</surname>
              <given-names>LA</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
          </person-group>
          <article-title>MIMIC-III, a freely accessible critical care database</article-title>
          <source>Sci Data</source>
          <year>2016</year>
          <month>05</month>
          <day>24</day>
          <volume>3</volume>
          <fpage>160035</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/sdata.2016.35"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id>
          <pub-id pub-id-type="medline">27219127</pub-id>
          <pub-id pub-id-type="pii">sdata201635</pub-id>
          <pub-id pub-id-type="pmcid">PMC4878278</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref52">
        <label>52</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goldberger</surname>
              <given-names>AL</given-names>
            </name>
            <name name-style="western">
              <surname>Amaral</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Glass</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Hausdorff</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Ivanov</surname>
              <given-names>PC</given-names>
            </name>
            <name name-style="western">
              <surname>Mark</surname>
              <given-names>RG</given-names>
            </name>
            <name name-style="western">
              <surname>Mietus</surname>
              <given-names>JE</given-names>
            </name>
            <name name-style="western">
              <surname>Moody</surname>
              <given-names>GB</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Stanley</surname>
              <given-names>HE</given-names>
            </name>
          </person-group>
          <article-title>PhysioBank, PhysioToolkit, and PhysioNet: components of a new research resource for complex physiologic signals</article-title>
          <source>Circulation</source>
          <year>2000</year>
          <month>06</month>
          <day>13</day>
          <volume>101</volume>
          <issue>23</issue>
          <fpage>e215</fpage>
          <pub-id pub-id-type="doi">10.1161/01.CIR.101.23.e215</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref53">
        <label>53</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pishgar</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Theis</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Del Rios</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ardati</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Anahideh</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Darabi</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Prediction of unplanned 30-day readmission for ICU patients with heart failure</article-title>
          <source>BMC Med Inform Decis Mak</source>
          <year>2022</year>
          <month>05</month>
          <day>02</day>
          <volume>22</volume>
          <issue>1</issue>
          <fpage>117</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://bmcmedinformdecismak.biomedcentral.com/articles/10.1186/s12911-022-01857-y"/>
          </comment>
          <pub-id pub-id-type="doi">10.1186/s12911-022-01857-y</pub-id>
          <pub-id pub-id-type="medline">35501789</pub-id>
          <pub-id pub-id-type="pii">10.1186/s12911-022-01857-y</pub-id>
          <pub-id pub-id-type="pmcid">PMC9063206</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref54">
        <label>54</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Aliu</surname>
              <given-names>O</given-names>
            </name>
            <name name-style="western">
              <surname>Auger</surname>
              <given-names>KA</given-names>
            </name>
            <name name-style="western">
              <surname>Sun</surname>
              <given-names>GH</given-names>
            </name>
            <name name-style="western">
              <surname>Burke</surname>
              <given-names>JF</given-names>
            </name>
            <name name-style="western">
              <surname>Cooke</surname>
              <given-names>CR</given-names>
            </name>
            <name name-style="western">
              <surname>Chung</surname>
              <given-names>KC</given-names>
            </name>
            <name name-style="western">
              <surname>Hayward</surname>
              <given-names>RA</given-names>
            </name>
          </person-group>
          <article-title>The effect of pre-Affordable Care Act (ACA) Medicaid eligibility expansion in New York State on access to specialty surgical care</article-title>
          <source>Med Care</source>
          <year>2014</year>
          <month>09</month>
          <volume>52</volume>
          <issue>9</issue>
          <fpage>790</fpage>
          <lpage>5</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/24984209"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/MLR.0000000000000175</pub-id>
          <pub-id pub-id-type="medline">24984209</pub-id>
          <pub-id pub-id-type="pmcid">PMC4262819</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref55">
        <label>55</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kahn</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Le</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Angus</surname>
              <given-names>DC</given-names>
            </name>
            <name name-style="western">
              <surname>Cox</surname>
              <given-names>CE</given-names>
            </name>
            <name name-style="western">
              <surname>Hough</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>White</surname>
              <given-names>DB</given-names>
            </name>
            <name name-style="western">
              <surname>Yende</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Carson</surname>
              <given-names>SS</given-names>
            </name>
            <collab>ProVent Study Group Investigators</collab>
          </person-group>
          <article-title>The epidemiology of chronic critical illness in the United States*</article-title>
          <source>Crit Care Med</source>
          <year>2015</year>
          <month>02</month>
          <volume>43</volume>
          <issue>2</issue>
          <fpage>282</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/25377018"/>
          </comment>
          <pub-id pub-id-type="doi">10.1097/CCM.0000000000000710</pub-id>
          <pub-id pub-id-type="medline">25377018</pub-id>
          <pub-id pub-id-type="pmcid">PMC7901538</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref56">
        <label>56</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sabbatini</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Kocher</surname>
              <given-names>KE</given-names>
            </name>
            <name name-style="western">
              <surname>Basu</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Hsia</surname>
              <given-names>RY</given-names>
            </name>
          </person-group>
          <article-title>In-hospital outcomes and costs among patients hospitalized during a return visit to the emergency department</article-title>
          <source>JAMA</source>
          <year>2016</year>
          <month>02</month>
          <day>16</day>
          <volume>315</volume>
          <issue>7</issue>
          <fpage>663</fpage>
          <lpage>71</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/26881369"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jama.2016.0649</pub-id>
          <pub-id pub-id-type="medline">26881369</pub-id>
          <pub-id pub-id-type="pii">2491638</pub-id>
          <pub-id pub-id-type="pmcid">PMC8366576</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref57">
        <label>57</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grantham</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>COVID-19 survival calculator</article-title>
          <source>Nexoid</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.covid19survivalcalculator.com/">https://www.covid19survivalcalculator.com/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref58">
        <label>58</label>
        <nlm-citation citation-type="web">
          <article-title>Texas hospital inpatient discharge public use data file</article-title>
          <source>Texas Department of State Health Services</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.dshs.texas.gov/center-health-statistics/texas-health-care-information-collection/download-and-purchase-data/texas-inpatient-public-use-data-file-pudf">https://www.dshs.texas.gov/center-health-statistics/texas-health-care-information-collection/download-and-purchase-data/texas-inpatient-public-use-data-file-pudf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref59">
        <label>59</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Yu</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Machine learning methods for prediction of COVID-19 patient length of stay: using Texas PUDF data</article-title>
          <source>Proceedings of the 3rd International Conference on Electrical, Computer, Communications and Mechatronics Engineering</source>
          <year>2023</year>
          <conf-name>ICECCME '23</conf-name>
          <conf-date>July 19-21, 2023</conf-date>
          <conf-loc>Canary Islands, Spain</conf-loc>
          <fpage>1</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/10252792"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/ICECCME57830.2023.10252792</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref60">
        <label>60</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goss</surname>
              <given-names>LB</given-names>
            </name>
            <name name-style="western">
              <surname>Ortiz</surname>
              <given-names>JR</given-names>
            </name>
            <name name-style="western">
              <surname>Okamura</surname>
              <given-names>DM</given-names>
            </name>
            <name name-style="western">
              <surname>Hayward</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Goss</surname>
              <given-names>CH</given-names>
            </name>
          </person-group>
          <article-title>Significant reductions in mortality in hospitalized patients with systemic lupus erythematosus in Washington State from 2003 to 2011</article-title>
          <source>PLoS One</source>
          <year>2015</year>
          <volume>10</volume>
          <issue>6</issue>
          <fpage>e0128920</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0128920"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0128920</pub-id>
          <pub-id pub-id-type="medline">26087254</pub-id>
          <pub-id pub-id-type="pii">PONE-D-14-48928</pub-id>
          <pub-id pub-id-type="pmcid">PMC4473009</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref61">
        <label>61</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Metcalfe</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Zogg</surname>
              <given-names>CK</given-names>
            </name>
            <name name-style="western">
              <surname>Haut</surname>
              <given-names>ER</given-names>
            </name>
            <name name-style="western">
              <surname>Pawlik</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Haider</surname>
              <given-names>AH</given-names>
            </name>
            <name name-style="western">
              <surname>Perry</surname>
              <given-names>DC</given-names>
            </name>
          </person-group>
          <article-title>Data resource profile: state inpatient databases</article-title>
          <source>Int J Epidemiol</source>
          <year>2019</year>
          <month>12</month>
          <day>01</day>
          <volume>48</volume>
          <issue>6</issue>
          <fpage>1742</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31280297"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/ije/dyz117</pub-id>
          <pub-id pub-id-type="medline">31280297</pub-id>
          <pub-id pub-id-type="pii">5529312</pub-id>
          <pub-id pub-id-type="pmcid">PMC6929527</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref62">
        <label>62</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Barrett</surname>
              <given-names>ML</given-names>
            </name>
            <name name-style="western">
              <surname>Wier</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>HJ</given-names>
            </name>
            <name name-style="western">
              <surname>Steiner</surname>
              <given-names>CA</given-names>
            </name>
          </person-group>
          <article-title>All-cause readmissions by payer and age, 2009–2013</article-title>
          <source>Healthcare Cost and Utilization Project (HCUP) Statistical Briefs</source>
          <access-date>2024-10-14</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ncbi.nlm.nih.gov/books/NBK343800/pdf/Bookshelf_NBK343800.pdf">https://www.ncbi.nlm.nih.gov/books/NBK343800/pdf/Bookshelf_NBK343800.pdf</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref63">
        <label>63</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>El Kababji</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Pilgram</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Cano</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>pysdg</article-title>
          <source>Open Science Framework</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://osf.io/xj9pr/">https://osf.io/xj9pr/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref64">
        <label>64</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hothorn</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Hornik</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Zeileis</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Unbiased recursive partitioning: a conditional inference framework</article-title>
          <source>J Comput Graph Stat</source>
          <year>2006</year>
          <month>09</month>
          <volume>15</volume>
          <issue>3</issue>
          <fpage>651</fpage>
          <lpage>74</lpage>
          <pub-id pub-id-type="doi">10.1198/106186006X133933</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref65">
        <label>65</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Read</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Pfahringer</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Holmes</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Frank</surname>
              <given-names>E</given-names>
            </name>
          </person-group>
          <article-title>Classifier chains for multi-label classification</article-title>
          <source>Proceedings of the 2009 Conference on Machine Learning and Knowledge Discovery in Databases</source>
          <year>2009</year>
          <conf-name>ECML PKDD '09</conf-name>
          <conf-date>September 7-11, 2009</conf-date>
          <conf-loc>Bled, Slovenia</conf-loc>
          <fpage>254</fpage>
          <lpage>69</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://link.springer.com/chapter/10.1007/978-3-642-04174-7_17"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref66">
        <label>66</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Drechsler</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Reiter</surname>
              <given-names>JP</given-names>
            </name>
          </person-group>
          <article-title>An empirical evaluation of easily implemented, nonparametric methods for generating synthetic datasets</article-title>
          <source>Comput Stat Data Anal</source>
          <year>2011</year>
          <month>12</month>
          <volume>55</volume>
          <issue>12</issue>
          <fpage>3232</fpage>
          <lpage>43</lpage>
          <pub-id pub-id-type="doi">10.1016/j.csda.2011.06.006</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref67">
        <label>67</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Arslan</surname>
              <given-names>RC</given-names>
            </name>
            <name name-style="western">
              <surname>Schilling</surname>
              <given-names>KM</given-names>
            </name>
            <name name-style="western">
              <surname>Gerlach</surname>
              <given-names>TM</given-names>
            </name>
            <name name-style="western">
              <surname>Penke</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Using 26,000 diary entries to show ovulatory changes in sexual desire and behavior</article-title>
          <source>J Pers Soc Psychol</source>
          <year>2021</year>
          <month>08</month>
          <volume>121</volume>
          <issue>2</issue>
          <fpage>410</fpage>
          <lpage>31</lpage>
          <pub-id pub-id-type="doi">10.1037/pspp0000208</pub-id>
          <pub-id pub-id-type="medline">30148371</pub-id>
          <pub-id pub-id-type="pii">2018-41799-001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref68">
        <label>68</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bonnéry</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Feng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Henneberger</surname>
              <given-names>AK</given-names>
            </name>
            <name name-style="western">
              <surname>Johnson</surname>
              <given-names>TL</given-names>
            </name>
            <name name-style="western">
              <surname>Lachowicz</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Rose</surname>
              <given-names>BA</given-names>
            </name>
            <name name-style="western">
              <surname>Shaw</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Stapleton</surname>
              <given-names>LM</given-names>
            </name>
            <name name-style="western">
              <surname>Woolley</surname>
              <given-names>ME</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>The promise and limitations of synthetic data as a strategy to expand access to state-level multi-agency longitudinal data</article-title>
          <source>J Res Educ Eff</source>
          <year>2019</year>
          <month>08</month>
          <day>02</day>
          <volume>12</volume>
          <issue>4</issue>
          <fpage>616</fpage>
          <lpage>47</lpage>
          <pub-id pub-id-type="doi">10.1080/19345747.2019.1631421</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref69">
        <label>69</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sabay</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Harris</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Bejugama</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Jaceldo-Siegl</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Overcoming small data limitations in heart disease prediction by using surrogate data</article-title>
          <source>SMU Data Sci Rev</source>
          <year>2018</year>
          <volume>1</volume>
          <issue>3</issue>
          <fpage>2</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://scholar.smu.edu/cgi/viewcontent.cgi?article=1038&amp;context=datasciencereview#:~:text=Using%20the%20neural%20network%20model,maintaining%20stability%20at%201%20percent"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref70">
        <label>70</label>
        <nlm-citation citation-type="web">
          <article-title>Formal privacy and synthetic data for the American community survey</article-title>
          <source>US Census Bureau</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.census.gov/library/working-papers/2018/adrm/formal-privacy-synthetic-data-acs.html">https://www.census.gov/library/working-papers/2018/adrm/formal-privacy-synthetic-data-acs.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref71">
        <label>71</label>
        <nlm-citation citation-type="web">
          <article-title>Utility of synthetic microdata generated using tree-based methods</article-title>
          <source>United Nations Economic Commission for Europe</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://unece.org/statistics/events/SDC2015">https://unece.org/statistics/events/SDC2015</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref72">
        <label>72</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Raab</surname>
              <given-names>GM</given-names>
            </name>
            <name name-style="western">
              <surname>Nowok</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Dibben</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Practical data synthesis for large samples</article-title>
          <source>J Priv Confid</source>
          <year>2016</year>
          <volume>7</volume>
          <issue>3</issue>
          <fpage>67</fpage>
          <lpage>97</lpage>
          <pub-id pub-id-type="doi">10.29012/jpc.v7i3.407</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref73">
        <label>73</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nowok</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Raab</surname>
              <given-names>GM</given-names>
            </name>
            <name name-style="western">
              <surname>Dibben</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Providing bespoke synthetic data for the UK Longitudinal Studies and other sensitive data with the synthpop package for R</article-title>
          <source>Stat J IAOS</source>
          <year>2017</year>
          <month>08</month>
          <day>21</day>
          <volume>33</volume>
          <issue>3</issue>
          <fpage>785</fpage>
          <lpage>96</lpage>
          <pub-id pub-id-type="doi">10.3233/SJI-150153</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref74">
        <label>74</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Quintana</surname>
              <given-names>DS</given-names>
            </name>
          </person-group>
          <article-title>A synthetic dataset primer for the biobehavioural sciences to promote reproducibility and hypothesis generation</article-title>
          <source>Elife</source>
          <year>2020</year>
          <month>03</month>
          <day>11</day>
          <volume>9</volume>
          <fpage>e53275</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32159513"/>
          </comment>
          <pub-id pub-id-type="doi">10.7554/eLife.53275</pub-id>
          <pub-id pub-id-type="medline">32159513</pub-id>
          <pub-id pub-id-type="pii">53275</pub-id>
          <pub-id pub-id-type="pmcid">PMC7112950</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref75">
        <label>75</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kaur</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Sobiesk</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Patil</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bhagat</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Markuzon</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Application of Bayesian networks to generate synthetic health data</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2021</year>
          <month>03</month>
          <day>18</day>
          <volume>28</volume>
          <issue>4</issue>
          <fpage>801</fpage>
          <lpage>11</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33367620"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocaa303</pub-id>
          <pub-id pub-id-type="medline">33367620</pub-id>
          <pub-id pub-id-type="pii">6046159</pub-id>
          <pub-id pub-id-type="pmcid">PMC7973486</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref76">
        <label>76</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Murphy</surname>
              <given-names>KP</given-names>
            </name>
          </person-group>
          <source>Machine Learning: A Probabilistic Perspective</source>
          <year>2012</year>
          <publisher-loc>Cambridge, MA</publisher-loc>
          <publisher-name>MIT Press</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref77">
        <label>77</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Qian</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Cebere</surname>
              <given-names>BC</given-names>
            </name>
            <name name-style="western">
              <surname>van der Schaar</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Synthcity: facilitating innovative use cases of synthetic data in different data modalities</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on January 18, 2023</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2301.07573"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref78">
        <label>78</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Goodfellow</surname>
              <given-names>IJ</given-names>
            </name>
            <name name-style="western">
              <surname>Pouget-Abadie</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Mirza</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Warde-Farley</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Ozair</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Courville</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Bengio</surname>
              <given-names>Y</given-names>
            </name>
          </person-group>
          <article-title>Generative adversarial networks</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on June 10, 2014</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1406.2661"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref79">
        <label>79</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bourou</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>El Saer</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Velivassaki</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Voulkidis</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Zahariadis</surname>
              <given-names>T</given-names>
            </name>
          </person-group>
          <article-title>A review of tabular data synthesis using GANs on an IDS dataset</article-title>
          <source>Information</source>
          <year>2021</year>
          <month>09</month>
          <day>14</day>
          <volume>12</volume>
          <issue>9</issue>
          <fpage>375</fpage>
          <pub-id pub-id-type="doi">10.3390/info12090375</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref80">
        <label>80</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kingma</surname>
              <given-names>DP</given-names>
            </name>
            <name name-style="western">
              <surname>Welling</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Auto-encoding variational Bayes</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on December 20, 2013</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1312.6114"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref81">
        <label>81</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wan</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Zhang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>Variational autoencoder based synthetic data generation for imbalanced learning</article-title>
          <source>Proceedings of the 2017 IEEE Symposium Series on Computational Intelligence</source>
          <year>2017</year>
          <conf-name>SSCI '17</conf-name>
          <conf-date>November 27-December 1, 2017</conf-date>
          <conf-loc>Honolulu, HI</conf-loc>
          <fpage>1</fpage>
          <lpage>7</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/8285168"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/SSCI.2017.8285168</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref82">
        <label>82</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ishfaq</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Hoogi</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Rubin</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>TVAE: triplet-based variational autoencoder using metric learning</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on February 13, 2018</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1802.04403"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref83">
        <label>83</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Sohn</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Lee</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Yan</surname>
              <given-names>X</given-names>
            </name>
          </person-group>
          <article-title>Learning structured output representation using deep conditional generative models</article-title>
          <source>Proceedings of the 29th International Conference on Neural Information Processing Systems</source>
          <year>2015</year>
          <conf-name>NIPS '15</conf-name>
          <conf-date>December 7-12, 2015</conf-date>
          <conf-loc>Montreal, QC</conf-loc>
          <fpage>5</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://papers.nips.cc/paper/2015/hash/8d55a249e6baa5c06772297520da2051-Abstract.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref84">
        <label>84</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Salim</surname>
              <given-names>A</given-names>
              <suffix>Jr</suffix>
            </name>
          </person-group>
          <article-title>Synthetic patient generation: a deep learning approach using variational autoencoders</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on August 20, 2018</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1808.06444"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref85">
        <label>85</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Akrami</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Joshi</surname>
              <given-names>AA</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Aydöre</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Leahy</surname>
              <given-names>RM</given-names>
            </name>
          </person-group>
          <article-title>A robust variational autoencoder using beta divergence</article-title>
          <source>Knowl Based Syst</source>
          <year>2022</year>
          <month>02</month>
          <day>28</day>
          <volume>238</volume>
          <fpage>107886</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36714396"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.knosys.2021.107886</pub-id>
          <pub-id pub-id-type="medline">36714396</pub-id>
          <pub-id pub-id-type="pii">107886</pub-id>
          <pub-id pub-id-type="pmcid">PMC9881733</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref86">
        <label>86</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Snoek</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Larochelle</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>Adams</surname>
              <given-names>RP</given-names>
            </name>
          </person-group>
          <article-title>Practical Bayesian optimization of machine learning algorithms</article-title>
          <source>Proceedings of the 25th International Conference on Neural Information Processing Systems</source>
          <year>2012</year>
          <conf-name>NIPS '12</conf-name>
          <conf-date>December 3-6, 2012</conf-date>
          <conf-loc>Lake Tahoe, NV</conf-loc>
          <fpage>2951</fpage>
          <lpage>9</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://papers.nips.cc/paper_files/paper/2012/hash/05311655a15b75fab86956663e1819cd-Abstract.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref87">
        <label>87</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bartz</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Bartz-Beielstein</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Zaefferer</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Mersmann</surname>
              <given-names>O</given-names>
            </name>
          </person-group>
          <source>Hyperparameter Tuning for Machine and Deep Learning with R: A Practical Guide</source>
          <year>2023</year>
          <publisher-loc>Cham, Switzerland</publisher-loc>
          <publisher-name>Springer</publisher-name>
        </nlm-citation>
      </ref>
      <ref id="ref88">
        <label>88</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bischl</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Binder</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Lang</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pielok</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Richter</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Coors</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>J</given-names>
            </name>
          </person-group>
          <article-title>Hyperparameter optimization: foundations, algorithms, best practices and open challenges</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on July 13, 2021</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2107.05847v3"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref89">
        <label>89</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Binder</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pfisterer</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Bischl</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Collecting empirical data about hyperparameters for data driven AutoML</article-title>
          <source>Proceedings of the 7th ICML Workshop on Automated Machine Learning</source>
          <year>2020</year>
          <conf-name>AutoML '20</conf-name>
          <conf-date>July 17-18, 2020</conf-date>
          <conf-loc>Virtual Event</conf-loc>
          <fpage>1</fpage>
          <lpage>12</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.automl.org/wp-content/uploads/2020/07/AutoML_2020_paper_63.pdf"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref90">
        <label>90</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kühn</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Probst</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Thomas</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Bischl</surname>
              <given-names>B</given-names>
            </name>
          </person-group>
          <article-title>Automatic exploration of machine learning experiments on OpenML</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on June 28, 2018</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1806.10961v3"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref91">
        <label>91</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Juwara</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>El-Hussuna</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>An evaluation of synthetic data augmentation for mitigating covariate bias in health data</article-title>
          <source>Patterns (N Y)</source>
          <year>2024</year>
          <month>04</month>
          <day>12</day>
          <volume>5</volume>
          <issue>4</issue>
          <fpage>100946</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S2666-3899(24)00045-X"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.patter.2024.100946</pub-id>
          <pub-id pub-id-type="medline">38645766</pub-id>
          <pub-id pub-id-type="pii">S2666-3899(24)00045-X</pub-id>
          <pub-id pub-id-type="pmcid">PMC11026977</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref92">
        <label>92</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Huang</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Li</surname>
              <given-names>W</given-names>
            </name>
            <name name-style="western">
              <surname>Macheret</surname>
              <given-names>F</given-names>
            </name>
            <name name-style="western">
              <surname>Gabriel</surname>
              <given-names>RA</given-names>
            </name>
            <name name-style="western">
              <surname>Ohno-Machado</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>A tutorial on calibration measurements and calibration models for clinical prediction models</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2020</year>
          <month>04</month>
          <day>01</day>
          <volume>27</volume>
          <issue>4</issue>
          <fpage>621</fpage>
          <lpage>33</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/32106284"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocz228</pub-id>
          <pub-id pub-id-type="medline">32106284</pub-id>
          <pub-id pub-id-type="pii">5762806</pub-id>
          <pub-id pub-id-type="pmcid">PMC7075534</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref93">
        <label>93</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kull</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Filho</surname>
              <given-names>TS</given-names>
            </name>
            <name name-style="western">
              <surname>Flach</surname>
              <given-names>P</given-names>
            </name>
          </person-group>
          <article-title>Beta calibration: a well-founded and easily implemented improvement on logistic calibration for binary classifiers</article-title>
          <source>Proceedings of the 20th International Conference on Artificial Intelligence and Statistics</source>
          <year>2017</year>
          <conf-name>AISTATS '17</conf-name>
          <conf-date>April 20-22, 2017</conf-date>
          <conf-loc>Fort Lauderdale, FL</conf-loc>
          <fpage>623</fpage>
          <lpage>31</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.mlr.press/v54/kull17a.html"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref94">
        <label>94</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>sdgm package</article-title>
          <source>Open Science Framework</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://osf.io/DCJM6/">https://osf.io/DCJM6/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref95">
        <label>95</label>
        <nlm-citation citation-type="web">
          <article-title>TensorFlow for R - reference</article-title>
          <source>RStudio</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://tensorflow.rstudio.com/reference/">https://tensorflow.rstudio.com/reference/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref96">
        <label>96</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ruíz</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>López</surname>
              <given-names>OA</given-names>
            </name>
            <name name-style="western">
              <surname>Ramírez</surname>
              <given-names>GH</given-names>
            </name>
            <name name-style="western">
              <surname>Hiriart</surname>
              <given-names>JC</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Ruíz</surname>
              <given-names>JS</given-names>
            </name>
            <name name-style="western">
              <surname>López</surname>
              <given-names>OA</given-names>
            </name>
            <name name-style="western">
              <surname>Ramírez</surname>
              <given-names>GH</given-names>
            </name>
            <name name-style="western">
              <surname>Hiriart</surname>
              <given-names>JC</given-names>
            </name>
          </person-group>
          <article-title>Generalized linear mixed models for proportions and percentages</article-title>
          <source>Generalized Linear Mixed Models with Applications in Agriculture and Biology</source>
          <year>2023</year>
          <publisher-loc>Cham, Switzerland</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>209</fpage>
          <lpage>78</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref97">
        <label>97</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bates</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mächler</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Bolker</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Walker</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Fitting linear mixed-effects models using lme4</article-title>
          <source>J Stat Softw</source>
          <year>2015</year>
          <volume>67</volume>
          <issue>1</issue>
          <fpage>1</fpage>
          <lpage>48</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jstatsoft.org/article/view/v067i01"/>
          </comment>
          <pub-id pub-id-type="doi">10.18637/jss.v067.i01</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref98">
        <label>98</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kuznetsova</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Brockhoff</surname>
              <given-names>PB</given-names>
            </name>
            <name name-style="western">
              <surname>Christensen</surname>
              <given-names>RH</given-names>
            </name>
          </person-group>
          <article-title>lmerTest package: tests in linear mixed effects models</article-title>
          <source>J Stat Softw</source>
          <year>2017</year>
          <volume>82</volume>
          <issue>13</issue>
          <fpage>1</fpage>
          <lpage>26</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.jstatsoft.org/article/view/v082i13"/>
          </comment>
          <pub-id pub-id-type="doi">10.18637/jss.v082.i13</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref99">
        <label>99</label>
        <nlm-citation citation-type="web">
          <article-title>MuMIn: multi-model inference</article-title>
          <source>The Comprehensive R Archive Network</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://cran.r-project.org/web/packages/MuMIn/index.html">https://cran.r-project.org/web/packages/MuMIn/index.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref100">
        <label>100</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Cano</surname>
              <given-names>JR</given-names>
            </name>
          </person-group>
          <article-title>Analysis of data complexity measures for classification</article-title>
          <source>Expert Syst Appl</source>
          <year>2013</year>
          <month>09</month>
          <volume>40</volume>
          <issue>12</issue>
          <fpage>4820</fpage>
          <lpage>31</lpage>
          <pub-id pub-id-type="doi">10.1016/j.eswa.2013.02.025</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref101">
        <label>101</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Lorena</surname>
              <given-names>AC</given-names>
            </name>
            <name name-style="western">
              <surname>Garcia</surname>
              <given-names>LP</given-names>
            </name>
            <name name-style="western">
              <surname>Lehmann</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Souto</surname>
              <given-names>MC</given-names>
            </name>
            <name name-style="western">
              <surname>Ho</surname>
              <given-names>TK</given-names>
            </name>
          </person-group>
          <article-title>How complex is your classification problem?</article-title>
          <source>ACM Comput Surv</source>
          <year>2019</year>
          <month>09</month>
          <day>13</day>
          <volume>52</volume>
          <issue>5</issue>
          <fpage>1</fpage>
          <lpage>34</lpage>
          <pub-id pub-id-type="doi">10.1145/3347711</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref102">
        <label>102</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mosquera</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Zheng</surname>
              <given-names>C</given-names>
            </name>
          </person-group>
          <article-title>Optimizing the synthesis of clinical trial data using sequential trees</article-title>
          <source>J Am Med Inform Assoc</source>
          <year>2021</year>
          <month>01</month>
          <day>15</day>
          <volume>28</volume>
          <issue>1</issue>
          <fpage>3</fpage>
          <lpage>13</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/33186440"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/jamia/ocaa249</pub-id>
          <pub-id pub-id-type="medline">33186440</pub-id>
          <pub-id pub-id-type="pii">5981525</pub-id>
          <pub-id pub-id-type="pmcid">PMC7810457</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref103">
        <label>103</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Ankan</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Panda</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>pgmpy: probabilistic graphical models using Python</article-title>
          <source>Proceedings of the 14th Python in Science Conference</source>
          <year>2015</year>
          <conf-name>SciPy '15</conf-name>
          <conf-date>July 6-12, 2015</conf-date>
          <conf-loc>Austin, TX</conf-loc>
          <fpage>11</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://proceedings.scipy.org/articles/Majora-7b98e3ed-001.pdf"/>
          </comment>
          <pub-id pub-id-type="doi">10.25080/majora-7b98e3ed-001</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref104">
        <label>104</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Xu</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Skoularidou</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Cuesta-Infante</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Veeramachaneni</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Modeling tabular data using conditional GAN</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on July 1, 2019</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1907.00503"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref105">
        <label>105</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Watson</surname>
              <given-names>DS</given-names>
            </name>
            <name name-style="western">
              <surname>Blesch</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Kapar</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Wright</surname>
              <given-names>MN</given-names>
            </name>
          </person-group>
          <article-title>Adversarial random forests for density estimation and generative modeling</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on May 19, 2022</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2205.09435"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref106">
        <label>106</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Durkan</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Bekasov</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Murray</surname>
              <given-names>I</given-names>
            </name>
            <name name-style="western">
              <surname>Papamakarios</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Neural spline flows</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on June 10, 2019</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1906.04032"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref107">
        <label>107</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Liu</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>El Kababji</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mitsakakis</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Pilgram</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Walters</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Clemons</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Pond</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>El-Hussuna</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Synthetic data generation for augmenting small samples</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on January 30, 2025</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2501.18741"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref108">
        <label>108</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Wickham</surname>
              <given-names>H</given-names>
            </name>
            <name name-style="western">
              <surname>François</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Henry</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Müller</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Vaughan</surname>
              <given-names>D</given-names>
            </name>
          </person-group>
          <article-title>dplyr: a grammar of data manipulation</article-title>
          <source>dplyr</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dplyr.tidyverse.org/">https://dplyr.tidyverse.org/</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref109">
        <label>109</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Hyland</surname>
              <given-names>SL</given-names>
            </name>
            <name name-style="western">
              <surname>Esteban</surname>
              <given-names>C</given-names>
            </name>
            <name name-style="western">
              <surname>Rätsch</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Real-valued (medical) time series generation with recurrent conditional GANs</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on June 8, 2017</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/1706.02633"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref110">
        <label>110</label>
        <nlm-citation citation-type="confproc">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Kushwaha</surname>
              <given-names>PK</given-names>
            </name>
            <name name-style="western">
              <surname>Kumaresan</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Machine learning algorithm in healthcare system: a review</article-title>
          <source>Proceedings of the 2021 International Conference on Technological Advancements and Innovations</source>
          <year>2021</year>
          <conf-name>ICTAI '21</conf-name>
          <conf-date>November 10-12, 2021</conf-date>
          <conf-loc>Tashkent, Uzbekistan</conf-loc>
          <fpage>478</fpage>
          <lpage>81</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ieeexplore.ieee.org/document/9673220"/>
          </comment>
          <pub-id pub-id-type="doi">10.1109/ictai53825.2021.9673220</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref111">
        <label>111</label>
        <nlm-citation citation-type="book">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Gupta</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Sedamkar</surname>
              <given-names>RR</given-names>
            </name>
          </person-group>
          <person-group person-group-type="editor">
            <name name-style="western">
              <surname>Jain</surname>
              <given-names>V</given-names>
            </name>
            <name name-style="western">
              <surname>Chatterjee</surname>
              <given-names>JM</given-names>
            </name>
          </person-group>
          <article-title>Machine learning for healthcare: introduction</article-title>
          <source>Machine Learning with Health Care Perspective: Machine Learning and Healthcare</source>
          <year>2020</year>
          <publisher-loc>Cham, Switzerland</publisher-loc>
          <publisher-name>Springer</publisher-name>
          <fpage>1</fpage>
          <lpage>25</lpage>
        </nlm-citation>
      </ref>
      <ref id="ref112">
        <label>112</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Andaur Navarro</surname>
              <given-names>CL</given-names>
            </name>
            <name name-style="western">
              <surname>Damen</surname>
              <given-names>JA</given-names>
            </name>
            <name name-style="western">
              <surname>van Smeden</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Takada</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Nijman</surname>
              <given-names>SW</given-names>
            </name>
            <name name-style="western">
              <surname>Dhiman</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Ma</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Collins</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Bajpai</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Riley</surname>
              <given-names>RD</given-names>
            </name>
            <name name-style="western">
              <surname>Moons</surname>
              <given-names>KG</given-names>
            </name>
            <name name-style="western">
              <surname>Hooft</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Systematic review identifies the design and methodological conduct of studies on machine learning-based prediction models</article-title>
          <source>J Clin Epidemiol</source>
          <year>2023</year>
          <month>02</month>
          <volume>154</volume>
          <fpage>8</fpage>
          <lpage>22</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://linkinghub.elsevier.com/retrieve/pii/S0895-4356(22)00300-6"/>
          </comment>
          <pub-id pub-id-type="doi">10.1016/j.jclinepi.2022.11.015</pub-id>
          <pub-id pub-id-type="medline">36436815</pub-id>
          <pub-id pub-id-type="pii">S0895-4356(22)00300-6</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref113">
        <label>113</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Rousset</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Dellamonica</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Menuet</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Lira Pineda</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Sabatine</surname>
              <given-names>MS</given-names>
            </name>
            <name name-style="western">
              <surname>Giugliano</surname>
              <given-names>RP</given-names>
            </name>
            <name name-style="western">
              <surname>Trichelair</surname>
              <given-names>P</given-names>
            </name>
            <name name-style="western">
              <surname>Zaslavskiy</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Ricci</surname>
              <given-names>L</given-names>
            </name>
          </person-group>
          <article-title>Can machine learning bring cardiovascular risk assessment to the next level? A methodological study using FOURIER trial data</article-title>
          <source>Eur Heart J Digit Health</source>
          <year>2022</year>
          <month>03</month>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>38</fpage>
          <lpage>48</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/36713994"/>
          </comment>
          <pub-id pub-id-type="doi">10.1093/ehjdh/ztab093</pub-id>
          <pub-id pub-id-type="medline">36713994</pub-id>
          <pub-id pub-id-type="pii">ztab093</pub-id>
          <pub-id pub-id-type="pmcid">PMC9707897</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref114">
        <label>114</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>SF</given-names>
            </name>
            <name name-style="western">
              <surname>Reps</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Kai</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Garibaldi</surname>
              <given-names>JM</given-names>
            </name>
            <name name-style="western">
              <surname>Qureshi</surname>
              <given-names>N</given-names>
            </name>
          </person-group>
          <article-title>Can machine-learning improve cardiovascular risk prediction using routine clinical data?</article-title>
          <source>PLoS One</source>
          <year>2017</year>
          <volume>12</volume>
          <issue>4</issue>
          <fpage>e0174944</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://dx.plos.org/10.1371/journal.pone.0174944"/>
          </comment>
          <pub-id pub-id-type="doi">10.1371/journal.pone.0174944</pub-id>
          <pub-id pub-id-type="medline">28376093</pub-id>
          <pub-id pub-id-type="pii">PONE-D-16-49429</pub-id>
          <pub-id pub-id-type="pmcid">PMC5380334</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref115">
        <label>115</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Akyea</surname>
              <given-names>RK</given-names>
            </name>
            <name name-style="western">
              <surname>Qureshi</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Kai</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Weng</surname>
              <given-names>SF</given-names>
            </name>
          </person-group>
          <article-title>Performance and clinical utility of supervised machine-learning approaches in detecting familial hypercholesterolaemia in primary care</article-title>
          <source>NPJ Digit Med</source>
          <year>2020</year>
          <month>10</month>
          <day>30</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>142</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41746-020-00349-5"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41746-020-00349-5</pub-id>
          <pub-id pub-id-type="medline">33145438</pub-id>
          <pub-id pub-id-type="pii">349</pub-id>
          <pub-id pub-id-type="pmcid">PMC7603302</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref116">
        <label>116</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Desai</surname>
              <given-names>RJ</given-names>
            </name>
            <name name-style="western">
              <surname>Wang</surname>
              <given-names>SV</given-names>
            </name>
            <name name-style="western">
              <surname>Vaduganathan</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Evers</surname>
              <given-names>T</given-names>
            </name>
            <name name-style="western">
              <surname>Schneeweiss</surname>
              <given-names>S</given-names>
            </name>
          </person-group>
          <article-title>Comparison of machine learning methods with traditional models for use of administrative claims with electronic medical records to predict heart failure outcomes</article-title>
          <source>JAMA Netw Open</source>
          <year>2020</year>
          <month>01</month>
          <day>03</day>
          <volume>3</volume>
          <issue>1</issue>
          <fpage>e1918962</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/31922560"/>
          </comment>
          <pub-id pub-id-type="doi">10.1001/jamanetworkopen.2019.18962</pub-id>
          <pub-id pub-id-type="medline">31922560</pub-id>
          <pub-id pub-id-type="pii">2758475</pub-id>
          <pub-id pub-id-type="pmcid">PMC6991258</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref117">
        <label>117</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Li</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Jiang</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>He</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Jia</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Peng</surname>
              <given-names>Y</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>M</given-names>
            </name>
          </person-group>
          <article-title>Machine learning to predict the 1-year mortality rate after acute anterior myocardial infarction in Chinese patients</article-title>
          <source>Ther Clin Risk Manag</source>
          <year>2020</year>
          <month>01</month>
          <volume>16</volume>
          <fpage>1</fpage>
          <lpage>6</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.tandfonline.com/doi/10.2147/TCRM.S236498?url_ver=Z39.88-2003&#38;rfr_id=ori:rid:crossref.org&#38;rfr_dat=cr_pub%20%200pubmed"/>
          </comment>
          <pub-id pub-id-type="doi">10.2147/TCRM.S236498</pub-id>
          <pub-id pub-id-type="medline">32021220</pub-id>
          <pub-id pub-id-type="pii">236498</pub-id>
          <pub-id pub-id-type="pmcid">PMC6957091</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref118">
        <label>118</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Shwartz-Ziv</surname>
              <given-names>R</given-names>
            </name>
            <name name-style="western">
              <surname>Armon</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>Tabular data: deep learning is not all you need</article-title>
          <source>Inf Fusion</source>
          <year>2022</year>
          <month>05</month>
          <volume>81</volume>
          <fpage>84</fpage>
          <lpage>90</lpage>
          <pub-id pub-id-type="doi">10.1016/j.inffus.2021.11.011</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref119">
        <label>119</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Grinsztajn</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Oyallon</surname>
              <given-names>E</given-names>
            </name>
            <name name-style="western">
              <surname>Varoquaux</surname>
              <given-names>G</given-names>
            </name>
          </person-group>
          <article-title>Why do tree-based models still outperform deep learning on typical tabular data?</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on July 18, 2022</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2207.08815"/>
          </comment>
          <pub-id pub-id-type="doi">10.48550/arXiv.2207.08815</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref120">
        <label>120</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Van Calster</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Collins</surname>
              <given-names>GS</given-names>
            </name>
            <name name-style="western">
              <surname>Vickers</surname>
              <given-names>AJ</given-names>
            </name>
            <name name-style="western">
              <surname>Wynants</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Kerr</surname>
              <given-names>KF</given-names>
            </name>
            <name name-style="western">
              <surname>Barreñada</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Varoquaux</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Singh</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Moons</surname>
              <given-names>KG</given-names>
            </name>
          </person-group>
          <article-title>Performance evaluation of predictive AI models to support medical decisions: Overview and guidance</article-title>
          <source>arXiv</source>
          <comment>Preprint posted online on December 13, 2024</comment>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://arxiv.org/abs/2412.10288v1"/>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref121">
        <label>121</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Bradshaw</surname>
              <given-names>TJ</given-names>
            </name>
            <name name-style="western">
              <surname>Huemann</surname>
              <given-names>Z</given-names>
            </name>
            <name name-style="western">
              <surname>Hu</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Rahmim</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>A guide to cross-validation for artificial intelligence in medical imaging</article-title>
          <source>Radiol Artif Intell</source>
          <year>2023</year>
          <month>07</month>
          <day>01</day>
          <volume>5</volume>
          <issue>4</issue>
          <fpage>e220232</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://europepmc.org/abstract/MED/37529208"/>
          </comment>
          <pub-id pub-id-type="doi">10.1148/ryai.220232</pub-id>
          <pub-id pub-id-type="medline">37529208</pub-id>
          <pub-id pub-id-type="pmcid">PMC10388213</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref122">
        <label>122</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Nakagawa</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Schielzeth</surname>
              <given-names>H</given-names>
            </name>
          </person-group>
          <article-title>A general and simple method for obtaining R<sup>2</sup> from generalized linear mixed-effects models</article-title>
          <source>Methods Ecol Evol</source>
          <year>2012</year>
          <month>12</month>
          <day>03</day>
          <volume>4</volume>
          <issue>2</issue>
          <fpage>133</fpage>
          <lpage>42</lpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1111/j.2041-210x.2012.00261.x"/>
          </comment>
          <pub-id pub-id-type="doi">10.1111/j.2041-210x.2012.00261.x</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref123">
        <label>123</label>
        <nlm-citation citation-type="web">
          <article-title>Tri-council policy statement: ethical conduct for research involving humans – TCPS 2 (2018)</article-title>
          <source>Government of Canada</source>
          <year>2018</year>
          <access-date>2020-05-09</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://ethics.gc.ca/eng/tcps2-eptc2_2018_chapter3-chapitre3.html">https://ethics.gc.ca/eng/tcps2-eptc2_2018_chapter3-chapitre3.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref124">
        <label>124</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Dixon</surname>
              <given-names>JR</given-names>
            </name>
          </person-group>
          <article-title>The international conference on harmonization good clinical practice guideline</article-title>
          <source>Qual Assur</source>
          <year>1998</year>
          <month>11</month>
          <day>30</day>
          <volume>6</volume>
          <issue>2</issue>
          <fpage>65</fpage>
          <lpage>74</lpage>
          <pub-id pub-id-type="doi">10.1080/105294199277860</pub-id>
          <pub-id pub-id-type="medline">10386329</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref125">
        <label>125</label>
        <nlm-citation citation-type="web">
          <article-title>Guidance document: part C, division 5 of the food and drug regulations “drugs for clinical trials involving human subjects” (GUI-0100) - summary</article-title>
          <source>Government of Canada</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.canada.ca/en/health-canada/services/drugs-health-products/compliance-enforcement/good-clinical-practices/guidance-documents/guidance-drugs-clinical-trials-human-subjects-gui-0100.html">https://tinyurl.com/46hxf54t</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref126">
        <label>126</label>
        <nlm-citation citation-type="web">
          <article-title>Natural health products regulations SOR/2003-196</article-title>
          <source>Government of Canada</source>
          <year>2025</year>
          <access-date>2025-05-31</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://laws-lois.justice.gc.ca/eng/regulations/SOR-2003-196/FullText.html">https://laws-lois.justice.gc.ca/eng/regulations/SOR-2003-196/FullText.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref127">
        <label>127</label>
        <nlm-citation citation-type="web">
          <article-title>Medical devices regulations SOR/98-282</article-title>
          <source>Government of Canada</source>
          <year>2025</year>
          <access-date>2025-05-31</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://laws-lois.justice.gc.ca/eng/regulations/SOR-98-282/section-68.11.html">https://laws-lois.justice.gc.ca/eng/regulations/SOR-98-282/section-68.11.html</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref128">
        <label>128</label>
        <nlm-citation citation-type="web">
          <article-title>Personal health information protection act, 2004, S.O. 2004, c. 3, Sched. A</article-title>
          <source>Government of Ontario</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://www.ontario.ca/laws/statute/04p03">https://www.ontario.ca/laws/statute/04p03</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref129">
        <label>129</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>van Hoorn</surname>
              <given-names>R</given-names>
            </name>
          </person-group>
          <article-title>On the acceptance, adoption, and utility of synthetic data for healthcare innovation</article-title>
          <source>Eindhoven University of Technology</source>
          <access-date>2024-12-21</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://research.tue.nl/en/studentTheses/on-the-acceptance-adoption-and-utility-of-synthetic-data-for-heal">https://research.tue.nl/en/studentTheses/on-the-acceptance-adoption-and-utility-of-synthetic-data-for-heal</ext-link>
          </comment>
        </nlm-citation>
      </ref>
      <ref id="ref130">
        <label>130</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
            <name name-style="western">
              <surname>Mosquera</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>El-Hussuna</surname>
              <given-names>A</given-names>
            </name>
          </person-group>
          <article-title>An evaluation of the replicability of analyses using synthetic health data</article-title>
          <source>Sci Rep</source>
          <year>2024</year>
          <month>03</month>
          <day>24</day>
          <volume>14</volume>
          <issue>1</issue>
          <fpage>6978</fpage>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.1038/s41598-024-57207-7"/>
          </comment>
          <pub-id pub-id-type="doi">10.1038/s41598-024-57207-7</pub-id>
          <pub-id pub-id-type="medline">38521806</pub-id>
          <pub-id pub-id-type="pii">10.1038/s41598-024-57207-7</pub-id>
          <pub-id pub-id-type="pmcid">PMC10960851</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref131">
        <label>131</label>
        <nlm-citation citation-type="journal">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>El Kababji</surname>
              <given-names>S</given-names>
            </name>
            <name name-style="western">
              <surname>Mitsakakis</surname>
              <given-names>N</given-names>
            </name>
            <name name-style="western">
              <surname>Fang</surname>
              <given-names>X</given-names>
            </name>
            <name name-style="western">
              <surname>Beltran-Bless</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Pond</surname>
              <given-names>G</given-names>
            </name>
            <name name-style="western">
              <surname>Vandermeer</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Radhakrishnan</surname>
              <given-names>D</given-names>
            </name>
            <name name-style="western">
              <surname>Mosquera</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Paterson</surname>
              <given-names>A</given-names>
            </name>
            <name name-style="western">
              <surname>Shepherd</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>Chen</surname>
              <given-names>B</given-names>
            </name>
            <name name-style="western">
              <surname>Barlow</surname>
              <given-names>WE</given-names>
            </name>
            <name name-style="western">
              <surname>Gralow</surname>
              <given-names>J</given-names>
            </name>
            <name name-style="western">
              <surname>Savard</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>Clemons</surname>
              <given-names>M</given-names>
            </name>
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Evaluating the utility and privacy of synthetic breast cancer clinical trial data sets</article-title>
          <source>JCO Clin Cancer Inform</source>
          <year>2023</year>
          <month>09</month>
          <volume>7</volume>
          <fpage>e2300116</fpage>
          <pub-id pub-id-type="doi">10.1200/cci.23.00116</pub-id>
        </nlm-citation>
      </ref>
      <ref id="ref132">
        <label>132</label>
        <nlm-citation citation-type="web">
          <person-group person-group-type="author">
            <name name-style="western">
              <surname>Pilgram</surname>
              <given-names>L</given-names>
            </name>
            <name name-style="western">
              <surname>El Emam</surname>
              <given-names>K</given-names>
            </name>
          </person-group>
          <article-title>Hallucinations in tabular synthetic data</article-title>
          <source>Open Science Framework</source>
          <access-date>2025-05-29</access-date>
          <comment>
            <ext-link ext-link-type="uri" xlink:type="simple" xlink:href="https://doi.org/10.17605/OSF.IO/DQSAB">https://doi.org/10.17605/OSF.IO/DQSAB</ext-link>
          </comment>
        </nlm-citation>
      </ref>
    </ref-list>
  </back>
</article>
