<?xml version="1.0" encoding="UTF-8"?><!DOCTYPE article PUBLIC "-//NLM//DTD Journal Publishing DTD v2.0 20040830//EN" "journalpublishing.dtd"><article xmlns:mml="http://www.w3.org/1998/Math/MathML" xmlns:xlink="http://www.w3.org/1999/xlink" dtd-version="2.0" xml:lang="en" article-type="research-article"><front><journal-meta><journal-id journal-id-type="nlm-ta">J Med Internet Res</journal-id><journal-id journal-id-type="publisher-id">jmir</journal-id><journal-id journal-id-type="index">1</journal-id><journal-title>Journal of Medical Internet Research</journal-title><abbrev-journal-title>J Med Internet Res</abbrev-journal-title><issn pub-type="epub">1438-8871</issn><publisher><publisher-name>JMIR Publications</publisher-name><publisher-loc>Toronto, Canada</publisher-loc></publisher></journal-meta><article-meta><article-id pub-id-type="publisher-id">v28i1e88678</article-id><article-id pub-id-type="doi">10.2196/88678</article-id><article-categories><subj-group subj-group-type="heading"><subject>Original Paper</subject></subj-group></article-categories><title-group><article-title>An Evaluation of Pretrained Generative Models for Augmenting Small Health Data: Comparative Modeling Study</article-title></title-group><contrib-group><contrib contrib-type="author"><name name-style="western"><surname>Huet-Dastarac</surname><given-names>Margerie</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Dankar</surname><given-names>Fida K</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Liu</surname><given-names>Dan</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>El 
Kababji</surname><given-names>Samer</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib><contrib contrib-type="author"><name name-style="western"><surname>Pilgram</surname><given-names>Lisa</given-names></name><degrees>MD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref><xref ref-type="aff" rid="aff3">3</xref></contrib><contrib contrib-type="author" corresp="yes"><name name-style="western"><surname>El Emam</surname><given-names>Khaled</given-names></name><degrees>PhD</degrees><xref ref-type="aff" rid="aff1">1</xref><xref ref-type="aff" rid="aff2">2</xref></contrib></contrib-group><aff id="aff1"><institution>School of Epidemiology and Public Health, Faculty of Medicine, University of Ottawa</institution><addr-line>451 Smyth Rd</addr-line><addr-line>Ottawa</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff2"><institution>Research Institute, Children's Hospital of Eastern Ontario</institution><addr-line>Ottawa</addr-line><addr-line>ON</addr-line><country>Canada</country></aff><aff id="aff3"><institution>Department of Nephrology and Medical Intensive Care, Charit&#x00E9; - Universitaetsmedizin Berlin</institution><addr-line>Berlin</addr-line><country>Germany</country></aff><contrib-group><contrib contrib-type="editor"><name name-style="western"><surname>Sarvestan</surname><given-names>Javad</given-names></name></contrib></contrib-group><contrib-group><contrib contrib-type="reviewer"><name name-style="western"><surname>Joodi</surname><given-names>Erfan</given-names></name></contrib><contrib contrib-type="reviewer"><name name-style="western"><surname>Wang</surname><given-names>Yiqing</given-names></name></contrib></contrib-group><author-notes><corresp>Correspondence to Khaled El Emam, PhD, School of Epidemiology and Public Health, Faculty of Medicine, University of Ottawa, 451 Smyth Rd, Ottawa, ON, K1H 8M5, Canada, 1 
613-562-5800; <email>kelemam@ehealthinformation.ca</email></corresp></author-notes><pub-date pub-type="collection"><year>2026</year></pub-date><pub-date pub-type="epub"><day>15</day><month>6</month><year>2026</year></pub-date><volume>28</volume><elocation-id>e88678</elocation-id><history><date date-type="received"><day>29</day><month>11</month><year>2025</year></date><date date-type="rev-recd"><day>14</day><month>05</month><year>2026</year></date><date date-type="accepted"><day>14</day><month>05</month><year>2026</year></date></history><copyright-statement>&#x00A9; Margerie Huet-Dastarac, Fida Dankar, Dan Liu, Samer El Kababji, Lisa Pilgram, Khaled El Emam. Originally published in the Journal of Medical Internet Research (<ext-link ext-link-type="uri" xlink:href="https://www.jmir.org">https://www.jmir.org</ext-link>), 15.6.2026. </copyright-statement><copyright-year>2026</copyright-year><license license-type="open-access" xlink:href="https://creativecommons.org/licenses/by/4.0/"><p>This is an open-access article distributed under the terms of the Creative Commons Attribution License (<ext-link ext-link-type="uri" xlink:href="https://creativecommons.org/licenses/by/4.0/">https://creativecommons.org/licenses/by/4.0/</ext-link>), which permits unrestricted use, distribution, and reproduction in any medium, provided the original work, first published in the Journal of Medical Internet Research (ISSN 1438-8871), is properly cited. 
The complete bibliographic information, a link to the original publication on <ext-link ext-link-type="uri" xlink:href="https://www.jmir.org/">https://www.jmir.org/</ext-link>, as well as this copyright and license information must be included.</p></license><self-uri xlink:type="simple" xlink:href="https://www.jmir.org/2026/1/e88678"/><abstract><sec><title>Background</title><p>Synthetic data generation (SDG) has emerged as a promising solution to address data scarcity in health care, where privacy concerns, regulatory barriers, and the high cost of data acquisition limit access to real patient datasets. Machine learning models in this domain often operate in low-data regimes, with training set sizes as low as 20 and a median dataset size of around 600 records&#x2014;conditions that hinder model generalization and increase the risks of overfitting and bias. SDG addresses these challenges by producing artificial samples that mimic real-world patient data, enabling robust and privacy-preserving model development.</p></sec><sec><title>Objective</title><p>This study was a comprehensive assessment of SDG-augmented training across a wide array of models&#x2014;both pretrained and non-pretrained&#x2014;for outcome prediction in 13 health care datasets. For small datasets of sizes 50 and 350 records, we answer 3 key questions: (1) Do pretrained SDG models generate more effective augmentations than their non-pretrained counterparts for small datasets? (2) Is augmentation beneficial for both pretrained and non-pretrained classifiers for small datasets? (3) Among 3 state-of-the-art classification models, which offers the best predictive performance on small datasets? 
The workload that this study aimed to improve was binary classification.</p></sec><sec sec-type="methods"><title>Methods</title><p>The 3 classifiers considered were light gradient boosting trees, large language models (LLMs) adapted to tabular data, and Tabular Prior-Data Fitted Network (TabPFN), a transformer-based method that has become the new state of the art in terms of tabular data classification. Each classifier was augmented through different SDG methods: current state-of-the-art techniques (Bayesian networks, conditional tabular generative adversarial networks, tabular variational autoencoders, and sequential trees) and the use of LLMs for tabular data generation.</p></sec><sec sec-type="results"><title>Results</title><p>Augmented TabPFN demonstrated superior performance, yielding significantly higher area under the curve and significantly lower integrated calibration index scores compared to other classifiers. Post hoc analysis revealed that, for the dataset sizes examined, SDG and LLM models exhibited overfitting tendencies. Notably, simple dataset augmentation through sampling with replacement achieved performance comparable to that of SDG-based and LLM-based augmentation methods for TabPFN, suggesting that gains were primarily driven by increased sample size rather than SDG.</p></sec><sec sec-type="conclusions"><title>Conclusions</title><p>Given its strong performance and minimal computational overhead, we recommend augmenting TabPFN through sampling with replacement as the optimal approach for small-data binary classification tasks. 
This method achieves performance comparable to that of more complex SDG techniques while offering substantial computational advantages.</p></sec></abstract><kwd-group><kwd>binary classification</kwd><kwd>machine learning</kwd><kwd>data augmentation</kwd><kwd>synthetic data generation</kwd><kwd>tabular data</kwd><kwd>small data regime</kwd></kwd-group></article-meta></front><body><sec id="s1" sec-type="intro"><title>Introduction</title><sec id="s1-1"><title>Background and Study Objectives</title><p>Machine learning predictive modeling applications in health care often suffer from limited access to real patient data due to privacy concerns, regulatory constraints, and the high cost of data acquisition. Recent reviews have identified that most machine learning studies rely on training models on datasets of insufficient sizes [<xref ref-type="bibr" rid="ref1">1</xref>-<xref ref-type="bibr" rid="ref5">5</xref>]. This shortage in data availability&#x2014;referred to as a low-data regime&#x2014;introduces challenges such as overfitting [<xref ref-type="bibr" rid="ref1">1</xref>,<xref ref-type="bibr" rid="ref6">6</xref>], biased learning, and reduced model robustness [<xref ref-type="bibr" rid="ref2">2</xref>].</p><p>Synthetic data generation (SDG) has been proposed as a potential solution by augmenting training datasets with artificially generated samples that closely mirror real patient data. Increasing the size of the training dataset is generally associated with improved predictive performance in machine learning models [<xref ref-type="bibr" rid="ref7">7</xref>]. However, it remains unclear whether the benefits of augmentation arise from the fidelity of synthetic samples or simply from the increased sample size. 
In addition, augmentation can be interpreted as a form of regularization, where synthetic examples increase the diversity of the training data by generating additional variations from the same underlying population [<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>A key question is, therefore, whether pretrained SDG models, when used for prediction and augmentation, can perform better than non-pretrained models and improve the performance of predictive clinical classification tasks. This study presents a large-scale evaluation of the impact of data augmentation by SDG on both pretrained and non-pretrained prediction models. We consider 2 realistic low-data regime scenarios for health datasets: 50 and 350 records. This definition of a low-data regime is consistent with current practices, where the size of the training datasets used in clinical prediction tasks can be as low as 20, with a median of around 600 records [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref9">9</xref>].</p><p>We structured our study to answer the following specific questions in the case of a low-data regime:</p><list list-type="bullet"><list-item><p>RQ1. Is data augmentation using pretrained SDG models outperforming data augmented by non-pretrained SDG models?</p></list-item><list-item><p>RQ2. What is the effect of augmentation on pretrained and non-pretrained classification models?</p></list-item><list-item><p>RQ3. What is the best clinical predictive classification model among gradient-boosted trees, large language models (LLMs), and Tabular Prior-Data Fitted Network (TabPFN)?</p></list-item></list><p>We challenge the assumption that augmentation through SDG is necessary for improving clinical prediction in low-data regimes by systematically comparing SDG, LLM-based augmentation, and resampling methods. 
We demonstrate that the gains are driven primarily by increased sample size, with simple sampling with replacement achieving performance comparable to or exceeding that of complex generative approaches.</p></sec><sec id="s1-2"><title>Previous Work</title><p>For nontabular data, data augmentation has been applied to address the low-data regime problem, such as in imaging, video, and natural language processing data [<xref ref-type="bibr" rid="ref10">10</xref>-<xref ref-type="bibr" rid="ref15">15</xref>], and it has been shown to be a viable solution to address the problem of incomplete and unbalanced time series datasets [<xref ref-type="bibr" rid="ref16">16</xref>-<xref ref-type="bibr" rid="ref19">19</xref>]. However, there has been limited work on the evaluation of data augmentation in the context of tabular health data for clinical predictive workloads.</p><p>Commonly used SDG models are conditional tabular generative adversarial networks (CTGANs) [<xref ref-type="bibr" rid="ref20">20</xref>], tabular variational autoencoders (TVAEs) [<xref ref-type="bibr" rid="ref21">21</xref>], Bayesian networks [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>], and sequential decision trees [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. However, classification models are not the only ones affected by the small dataset size available. In fact, the SDG models themselves may experience overfitting when trained on small datasets, raising questions about the quality of synthetic samples under the low-data regime.</p><p>Recent work has highlighted the potential of pretrained transformer models, such as LLMs, for tasks involving tabular data, specifically SDG and classification [<xref ref-type="bibr" rid="ref30">30</xref>,<xref ref-type="bibr" rid="ref31">31</xref>]. 
LLMs have been adapted from their original textual domain to tabular data through methods that account for dataset-specific properties, such as column-order and row invariance. Fine-tuning such models enables their application not only to classification but also to SDG [<xref ref-type="bibr" rid="ref32">32</xref>]. Several methods&#x2014;including Curated LLM (CLLM) [<xref ref-type="bibr" rid="ref33">33</xref>], LLMOverTab [<xref ref-type="bibr" rid="ref34">34</xref>], and Pred-LLM [<xref ref-type="bibr" rid="ref35">35</xref>]&#x2014;explicitly leverage LLM pretraining on large datasets for tabular data generation in low-data regimes.</p><p>However, recent work has noted that LLMs may not yet perform classification at a level comparable to traditional machine learning models, underscoring the importance of systematically evaluating LLMs in clinical contexts [<xref ref-type="bibr" rid="ref36">36</xref>]. Recent models such as the TabPFN were designed for classification and represent a transformer architecture pretrained on synthetic tabular data. While TabPFN demonstrates promising performance on datasets ranging from 650 to 10,000 records [<xref ref-type="bibr" rid="ref37">37</xref>], this size range fails to address the reality of clinical prediction tasks, where datasets commonly contain fewer than 600 records [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. This gap is particularly significant given that many clinical prediction tasks must operate in low-data regimes due to data privacy constraints and the rarity of certain conditions.</p></sec></sec><sec id="s2" sec-type="methods"><title>Methods</title><p>The workload that this study aimed to improve was predictive binary classification.</p><sec id="s2-1"><title>Ethical Considerations</title><p>This project was approved by the Research Ethics Board of the Children&#x2019;s Hospital of Eastern Ontario Research Institute, protocol 24/80x. 
Because the datasets used in this study were deidentified, obtaining participant consent was waived by the Research Ethics Board of the Children&#x2019;s Hospital of Eastern Ontario Research Institute. This project adhered to the Declaration of Helsinki.</p></sec><sec id="s2-2"><title>Models Evaluated</title><p>The pretrained and non-pretrained models used in this study are summarized in <xref ref-type="table" rid="table1">Table 1</xref>. We use the term <italic>non-pretrained</italic> to denote models trained directly from scratch on the available data, in contrast to LLMs and TabPFN, which rely on extensive pretraining on real and synthetic data.</p><table-wrap id="t1" position="float"><label>Table 1.</label><caption><p>Overview of the models evaluated in this study.</p></caption><table id="table1" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Purpose</td><td align="left" valign="bottom">Non-pretrained models</td><td align="left" valign="bottom">Pretrained models</td></tr></thead><tbody><tr><td align="left" valign="top">Classification model</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>LGBM<sup><xref ref-type="table-fn" rid="table1fn1">a</xref></sup></p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>DistilGPT2</p></list-item><list-item><p>Llama 1B</p></list-item><list-item><p>Llama 8B fine-tuned on UltraMedical dataset</p></list-item><list-item><p>TabPFN<sup><xref ref-type="table-fn" rid="table1fn2">b</xref></sup> version 2</p></list-item></list></td></tr><tr><td align="left" valign="top">Synthetic data generation (generative models)</td><td align="left" valign="top"><list list-type="bullet"><list-item><p>Sequential decision trees</p></list-item><list-item><p>Bayesian network</p></list-item><list-item><p>CTGAN<sup><xref ref-type="table-fn" rid="table1fn3">c</xref></sup></p></list-item><list-item><p>TVAE<sup><xref ref-type="table-fn" 
rid="table1fn4">d</xref></sup></p></list-item></list></td><td align="left" valign="top"><list list-type="bullet"><list-item><p>DistilGPT2</p></list-item><list-item><p>Llama 1B</p></list-item><list-item><p>Llama 8B fine-tuned on UltraMedical dataset</p></list-item></list></td></tr></tbody></table><table-wrap-foot><fn id="table1fn1"><p><sup>a</sup>LGBM: light gradient boosting machine.</p></fn><fn id="table1fn2"><p><sup>b</sup>TabPFN: Tabular Prior-Data Fitted Network.</p></fn><fn id="table1fn3"><p><sup>c</sup>CTGAN: conditional tabular generative adversarial network.</p></fn><fn id="table1fn4"><p><sup>d</sup>TVAE: tabular variational autoencoder.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-3"><title>Non-Pretrained Generative Models</title><p>We used 4 commonly applied generative modeling methods to generate new observations for structured tabular data. CTGAN is a conditional generative adversarial network specifically adapted for tabular data, which captures complex feature distributions through adversarial training [<xref ref-type="bibr" rid="ref38">38</xref>-<xref ref-type="bibr" rid="ref40">40</xref>]. TVAE uses variational autoencoding to model the joint distribution of tabular features, enabling flexible data synthesis [<xref ref-type="bibr" rid="ref21">21</xref>,<xref ref-type="bibr" rid="ref41">41</xref>,<xref ref-type="bibr" rid="ref42">42</xref>]. Bayesian networks represent probabilistic relationships between variables through directed acyclic graphs, allowing for the generation of synthetic data consistent with estimated dependencies [<xref ref-type="bibr" rid="ref22">22</xref>-<xref ref-type="bibr" rid="ref25">25</xref>]. Sequential trees generate synthetic data by recursively partitioning the feature space in a manner similar to decision trees, ensuring that complex conditional dependencies are preserved [<xref ref-type="bibr" rid="ref26">26</xref>-<xref ref-type="bibr" rid="ref29">29</xref>]. 
All 4 approaches have been widely adopted in recent work on tabular data synthesis.</p><p>Categorical and continuous features were identified based on dataset metadata. Continuous variables were normalized using training-set statistics, and categorical variables were one-hot encoded where required for the modeling task. Missing values were handled using the default mechanisms of each model (eg, a missing categorical value was treated as a valid category in the modeling). These mechanisms are described in the documentation or the implementation of the generative models. Hyperparameters followed standard recommended settings as described in the original implementations. Synthetic samples were generated by unconditional sampling from the fitted models and then inverse-transformed back to the original feature space.</p><p>Sequential synthesis was implemented using Aetion Generate, a commercial product from Aetion, and the other 3 methods (CTGAN, TVAE, and Bayesian networks) were implemented using an open-source Python package, Synthcity [<xref ref-type="bibr" rid="ref43">43</xref>]. The <italic>pysdg</italic> library [<xref ref-type="bibr" rid="ref44">44</xref>], our publicly available adaptation of Synthcity, provides further preprocessing and postprocessing on top of Synthcity.</p></sec><sec id="s2-4"><title>Pretrained Generative Models</title><p>Fine-tuning an LLM involves adapting a pretrained model to a specific task or domain by training it on a smaller, task-specific dataset such as processing tabular data instead of free text. Fine-tuning leverages the general knowledge already encoded in the model from pretraining on vast amounts of data, allowing the model to specialize without requiring training from scratch. During fine-tuning, the model&#x2019;s parameters are updated to align its outputs with the desired behavior. 
We used the low-rank adaptation (LoRA) approach [<xref ref-type="bibr" rid="ref45">45</xref>], which is commonly used for efficient fine-tuning.</p><p>LLMs are pretrained on large-scale text corpora and are therefore designed to process textual input and generate textual output. Several strategies have recently been proposed to adapt LLMs for tabular learning tasks. In this study, we used the Pred-LLM framework [<xref ref-type="bibr" rid="ref35">35</xref>], which reformulates each tabular record into a natural language sentence following the pattern &#x201C;<italic>column name is value,...</italic>&#x201D; for fine-tuning the LLM. To prevent the model from exploiting column order as information, the input columns were randomly shuffled during training. Finally, the target variable was consistently placed at the end of the sequence to ensure that predictions incorporated information from all other features. The variables of a record were therefore generated column by column, ending with the outcome variable.</p><p>Relying on fine-tuning on serialized tabular training data, the LLMs were used for SDG without an explicit system prompt. Each synthetic record was generated by conditioning the model on a randomly selected feature-value pair sampled from the empirical distribution of the training data. Given this partial input, the model autoregressively completed the remaining features in a fixed predefined order learned during fine-tuning. Generated outputs were parsed back into tabular form using the known schema. 
More details are provided in Section B in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><p>As described in Table B2 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>, we used 3 LLMs of various sizes: DistilGPT2 and Llama 1B, pretrained on general data, and Llama 8B, specifically fine-tuned on the UltraMedical dataset [<xref ref-type="bibr" rid="ref46">46</xref>], which contains more than 400,000 samples of synthetic and manually curated biomedical instructions.</p></sec><sec id="s2-5"><title>Non-Pretrained Classification Models</title><p>In this study, the chosen non-pretrained classification model was a light gradient boosting machine (LGBM) [<xref ref-type="bibr" rid="ref47">47</xref>]. Tree-based models are the most common type of machine learning prediction methods used in clinical research [<xref ref-type="bibr" rid="ref3">3</xref>]. They perform better than linear models, such as logistic regression [<xref ref-type="bibr" rid="ref48">48</xref>-<xref ref-type="bibr" rid="ref52">52</xref>], and they were found to perform better than deep learning models on tabular datasets [<xref ref-type="bibr" rid="ref53">53</xref>,<xref ref-type="bibr" rid="ref54">54</xref>].</p><p>Model tuning used 5-fold cross-validation and Bayesian optimization [<xref ref-type="bibr" rid="ref55">55</xref>]. The ranges for the tuning parameters followed previous suggestions [<xref ref-type="bibr" rid="ref56">56</xref>-<xref ref-type="bibr" rid="ref59">59</xref>] and are summarized in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. High-cardinality variables were converted to embeddings [<xref ref-type="bibr" rid="ref60">60</xref>] using a scheme similar to target encoding.</p></sec><sec id="s2-6"><title>Pretrained Classification Models</title><p>The same pretrained models used for the generation of data were also used for classification. 
Their application to classification tasks can be seen as only the last step of the generation process, where the outcome variable is generated for a record based on all previous variables.</p><p>Another model, TabPFN, is a transformer-based model designed to perform classification and regression tasks on tabular data. It is trained on a large corpus of synthetic classification tasks with known Bayesian-optimal solutions, from which the model learns to approximate posterior class probabilities directly from features and labels without requiring further training on new tasks. This allows it to generalize effectively across diverse tabular datasets and make predictions quickly, particularly excelling in low-data regimes. Unlike traditional machine learning models that rely on iterative training and hyperparameter tuning, TabPFN offers fast, zero-shot inference through an in-context learning mechanism.</p></sec><sec id="s2-7"><title>Research Questions</title><sec id="s2-7-1"><title>RQ1: Is Data Augmentation Generated by Pretrained SDG Models Outperforming Data Augmented by Non-Pretrained SDG Models?</title><p>We trained 2 classifiers with augmented datasets: LGBM and TabPFN. The synthetic samples of the augmented data were generated by the 3 pretrained SDG approaches based on different LLMs and 4 non-pretrained models.</p><p>The 2 classifiers were trained for binary classification tasks. We assessed the downstream utility by reporting the area under the curve (AUC) score, integrated calibration index (ICI), and the corresponding n&#x2032; (the number of synthetic samples).</p><p>We performed 1-tailed paired permutation tests on both the AUC and ICI metrics across the datasets to comprehensively evaluate whether pretrained SDG models outperform non-pretrained SDG models. 
A 1-tailed permutation test was chosen because our hypothesis was directional&#x2014;namely, that pretrained SDG models would outperform non-pretrained SDG models under the small data regime by leveraging knowledge from their pretraining.</p><p>We selected the models yielding the highest AUC and the ones yielding the lowest ICI in each category (pretrained and non-pretrained) for each dataset and performed the permutation tests. Testing AUC allows us to determine if one model demonstrates significantly better discrimination&#x2014;the ability to correctly rank outcomes&#x2014;whereas testing ICI evaluates whether one model provides more accurate probability estimates through improved calibration.</p></sec><sec id="s2-7-2"><title>RQ2: What Is the Effect of Augmentation on Pretrained and Non-Pretrained Classification Models?</title><p>This question entails a comparison of whether to use data augmentation or not for pretrained and non-pretrained classifiers. We considered TabPFN as a pretrained classifier and LGBM as a non-pretrained classifier. We used the most beneficial data augmentation method, selected through the analysis answering the first question of this study (RQ1), and compared the classification results to the no augmentation baselines.</p><p>These 2 classifiers were trained on the same prediction tasks as RQ1, and the same downstream utility metrics were reported. We performed 1-tailed paired permutation statistical tests. 
One-tailed permutation tests were chosen because our hypothesis was directional&#x2014;namely, that augmentation would improve classification performance for models under the small data regime.</p></sec><sec id="s2-7-3"><title>RQ3: What Is the Best Clinical Prediction Classification Model Among LGBM, LLMs, and TabPFN?</title><p>To answer this final question, we compared the performance of each of the considered classifiers&#x2014;LGBM, fine-tuned LLMs, and TabPFN&#x2014;each in their best-performing augmentation configuration, as determined by answers to questions RQ1 and RQ2. Six 1-tailed permutation tests were performed for each low-data regime: 3 on the AUC metric and 3 on the ICI metric&#x2014;LGBM versus LLM, TabPFN versus LGBM, and TabPFN versus LLM.</p><p>For each dataset and the 2 low-data regimes, we reported in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> the best-performing methods in terms of aggregated AUC, corresponding ICI, and n&#x2032;.</p></sec></sec><sec id="s2-8"><title>Study Design</title><p>To address our research questions, we designed a comprehensive evaluation procedure. We summarize the main tasks below and expand on some of the key ones after that.</p><sec id="s2-8-1"><title>Evaluation Scope</title><p>We evaluated 3 types of binary classifiers (LGBM; LLMs&#x2014;DistilGPT2, Llama 1B, and Llama 8B; and TabPFN) on binary classification tasks with different augmentation strategies. An important limitation was that LLM classifiers were only evaluated without augmentation due to computational constraints. Fine-tuning LLMs with different quantities of synthetic samples would require 360 days using 1 NVIDIA-RTX-A6000 GPU with 48 GB of RAM (NVIDIA Corporation). This computational intensity makes such extensive fine-tuning impractical for the intended end users. Therefore, LLMs were used only as baseline classifiers and as SDG models. 
A detailed analysis of computational requirements is provided in Section D in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p></sec><sec id="s2-8-2"><title>Dataset Preparation</title><p>We used 13 clinical datasets (summarized in <xref ref-type="table" rid="table2">Table 2</xref> and further detailed in Section E in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). For each dataset, we simulated 2 low-data regimes by randomly sampling subsets of n<sub>0</sub>=50 and n<sub>0</sub>=350 records. We created hold-out validation and test sets of 10,000 records each from the remaining data; these sets were kept fixed so that the different models could be evaluated fairly. We used stratified sampling for these sets to keep the original prevalence of target classes.</p><table-wrap id="t2" position="float"><label>Table 2.</label><caption><p>Summary of the 13 large real-world datasets from which 50 and 350 records were sampled to train the synthetic data generation and classification models and from which 10,000 records were sampled as test sets.</p></caption><table id="table2" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Dataset name</td><td align="left" valign="bottom">Description</td><td align="left" valign="bottom">Number of variables</td></tr></thead><tbody><tr><td align="left" valign="top">COVID (COVID-19)</td><td align="left" valign="top">A dataset that covers COVID-19 health records of Canadians collected by Esri Canada</td><td align="left" valign="top">7</td></tr><tr><td align="left" valign="top">Canadian Community Health Survey (CCHS)</td><td align="left" valign="top">A pooled version of survey data across multiple years that gathers health information for the Canadian population</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">COVID Survival (Nexoid)</td><td align="left" valign="top">A secondary web-based survey dataset concerning COVID-19 survival prediction collected by the 
Nexoid company in London, UK</td><td align="left" valign="top">19</td></tr><tr><td align="left" valign="top">FDA Adverse Event Reporting System (FAERS)</td><td align="left" valign="top">A database that contains adverse events and medication error reports submitted to the FDA<sup><xref ref-type="table-fn" rid="table2fn1">a</xref></sup></td><td align="left" valign="top">7</td></tr><tr><td align="left" valign="top">Texas Inpatient Data (Texas)</td><td align="left" valign="top">A dataset on discharges from Texas hospitals</td><td align="left" valign="top">11</td></tr><tr><td align="left" valign="top">Washington State Hospital Discharge (Washington)</td><td align="left" valign="top">A dataset that collects the hospital discharge information from the HCUP<sup><xref ref-type="table-fn" rid="table2fn2">b</xref></sup> state inpatient database for 2007</td><td align="left" valign="top">8</td></tr><tr><td align="left" valign="top">Basic Stand Alone Inpatient Claims (BSA)</td><td align="left" valign="top">A dataset that contains the claim-level information from 2008 Medicare inpatient claims</td><td align="left" valign="top">6</td></tr><tr><td align="left" valign="top">Washington State Hospital Discharge (Washington 2008)</td><td align="left" valign="top">A dataset that collects the hospital discharge information from the HCUP state inpatient database for 2008</td><td align="left" valign="top">18</td></tr><tr><td align="left" valign="top">California Hospital Discharge (California)</td><td align="left" valign="top">A dataset that collects the hospital discharge information from the HCUP state inpatient database for 2007</td><td align="left" valign="top">16</td></tr><tr><td align="left" valign="top">Florida Hospital Discharge (Florida)</td><td align="left" valign="top">A dataset that collects the hospital discharge information from the HCUP state inpatient database for 2007</td><td align="left" valign="top">12</td></tr><tr><td align="left" valign="top">New York Hospital 
Discharge (New York)</td><td align="left" valign="top">A dataset that collects the hospital discharge information from the HCUP state inpatient database for 2007</td><td align="left" valign="top">14</td></tr><tr><td align="left" valign="top">Medical Information Mart for Intensive Care III (MIMIC-III)</td><td align="left" valign="top">A dataset that comprises deidentified health data associated with intensive care unit admissions</td><td align="left" valign="top">13</td></tr><tr><td align="left" valign="top">Better Outcomes Registry &#x0026; Network (BORN)</td><td align="left" valign="top">A dataset that collects data about pregnancy, birth, and childhood in the province of Ontario</td><td align="left" valign="top">20</td></tr></tbody></table><table-wrap-foot><fn id="table2fn1"><p><sup>a</sup>FDA: Food and Drug Administration.</p></fn><fn id="table2fn2"><p><sup>b</sup>HCUP: Healthcare Cost and Utilization Project.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s2-8-3"><title>Synthetic Data Generation</title><p>We used 7 SDG models: 3 pretrained models (DistilGPT2, Llama 1B, and Llama 8B) and 4 non-pretrained models (sequential decision trees, Bayesian network, CTGAN, and TVAE). For each SDG model and each dataset, we generated synthetic samples in varying quantities (n&#x2032;) ranging from 5 to 10,000 records, following a geometric series (details provided in the Augmentation Scheme section). For each n&#x2032;, we generated 5 synthetic datasets to account for model stochasticity and averaged the performance results across the augmented datasets.</p></sec><sec id="s2-8-4"><title>Data Augmentation</title><p>For each combination of real data subset (n<sub>0</sub>) and synthetic data (n&#x2032;), we created augmented datasets by concatenating the real and synthetic data. 
This process was repeated for all SDG models and both low-data regimes (n<sub>0</sub>=50 and n<sub>0</sub>=350).</p></sec><sec id="s2-8-5"><title>Model Training and Evaluation</title><p>For each classifier (except LLMs), we trained models on original data only (no augmentation), augmented data from pretrained SDG models, and augmented data from non-pretrained SDG models.</p><p>We evaluated each model&#x2019;s performance using the hold-out validation set, calculating AUC and ICI. For augmented datasets, we averaged the performance across the 5 synthetic datasets for each n&#x2032;. Therefore, for each n&#x2032; value, we trained 5 models.</p></sec><sec id="s2-8-6"><title>Optimal Augmentation Selection</title><p>For each dataset, classifier, and SDG model combination, we identified the optimal n&#x2032; that yielded the best AUC on the validation set. We recorded the corresponding AUC, ICI, and n&#x2032; for further analysis.</p></sec><sec id="s2-8-7"><title>Statistical Analysis</title><p>To answer RQ1, we performed 1-tailed paired permutation tests comparing the best-performing pretrained and non-pretrained SDG models for both AUC and ICI metrics. For RQ2, we conducted 1-tailed paired permutation tests to compare augmented versus nonaugmented performance for LGBM and TabPFN for both AUC and ICI metrics. To address RQ3, we performed six 1-tailed permutation tests (3 each for AUC and ICI) comparing LGBM, LLMs, and TabPFN in their best-performing configurations.</p></sec><sec id="s2-8-8"><title>Reporting Results</title><p>We summarized the results in tables and figures, showing the performance metrics (AUC and ICI) and optimal n&#x2032; for each combination of classifier, augmentation strategy, and dataset. 
We reported the outcomes of all statistical tests, indicating the magnitude and significance of the differences between methods.</p><p>This overall evaluation design allows us to systematically assess the effectiveness of data augmentation techniques, compare pretrained and non-pretrained models, and identify the best-performing clinical prediction classification models under low-data regimes, while acknowledging and accounting for practical computational constraints.</p></sec></sec><sec id="s2-9"><title>Augmentation Scheme</title><p>Augmented data represents the concatenation of the original data (size n<sub>0</sub>) and the synthetic data (size n&#x2032;). To assess performance under varying low-data regimes, we simulate 2 levels of data scarcity by randomly sampling a subset of n<sub>0</sub>=50 and n<sub>0</sub>=350 records from each of the 13 datasets (see <xref ref-type="table" rid="table2">Table 2</xref> for a summary). For each dataset, we identify the optimal number of synthetic samples (n&#x2032;) and determine the best-performing SDG method. Unlike prior studies, which often fix an arbitrary number of synthetic samples across datasets, our approach adapts the number of augmented examples based on dataset-specific performance criteria.</p><p>To determine the optimal n&#x2032;, we selected an augmentation scheme that samples finely at the low end and coarsely at the high end of the range. Ten geometric series were created and provided n&#x2032; values varying from 5 to 10,000 records. The sizes of these synthetic datasets follow a geometric series defined by n&#x2032; = [<italic>b</italic><sup>(<italic>i</italic>+4)</sup>], where <italic>b</italic>~N(1.5, 0.005) and <italic>i</italic>=1,...,25. This results in multiple augmented datasets of size n = n<sub>0</sub> + n&#x2032; for each base dataset. For each n&#x2032;, 5 synthetic datasets were generated and used to augment the real data and train classifiers. 
To reduce the impact of generative model stochasticity, we computed the average performance across these 5 augmented datasets. The same hold-out real validation datasets of 10,000 records served to determine the n&#x2032; yielding the best AUC scores, and we used the same hold-out test datasets of 10,000 records for model performance evaluation and comparison (<xref ref-type="fig" rid="figure1">Figure 1</xref>).</p><fig position="float" id="figure1"><label>Figure 1.</label><caption><p>Augmentation scheme. AUC: area under the curve; ICI: integrated calibration index; SDG: synthetic data generation.</p></caption><graphic alt-version="no" mimetype="image" position="float" xlink:type="simple" xlink:href="jmir_v28i1e88678_fig01.png"/></fig></sec><sec id="s2-10"><title>Datasets</title><p><xref ref-type="table" rid="table2">Table 2</xref> summarizes the 13 health datasets used in the study. These datasets cover heterogeneous domains, including public health, hospital discharge, infant and maternal health, adverse events, intensive care unit, population health surveys, and insurance claims. The table provides an overview of the datasets and the number of variables included in the binary classification models used to predict the outcome. A detailed description of each preprocessed dataset and the binary workload used for modeling can be found in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>. The number of predictor variables in the workloads is consistent with what is seen in the clinical prediction literature [<xref ref-type="bibr" rid="ref3">3</xref>].</p></sec></sec><sec id="s3" sec-type="results"><title>Results</title><p>We report the results for each research question. 
Full details of the statistical analysis are provided in Section F in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>.</p><sec id="s3-1"><title>Q1: Impact of Pretrained vs Non-Pretrained Augmentation</title><p>We compared pretrained and non-pretrained SDG methods across the 13 datasets using 1-tailed paired permutation tests (<xref ref-type="table" rid="table3">Table 3</xref>).</p><table-wrap id="t3" position="float"><label>Table 3.</label><caption><p>Pairwise permutation tests to determine the impact of pretrained augmentation against non-pretrained augmentation. <italic>P</italic> values were adjusted for multiple comparisons using the Holm-Bonferroni procedure.</p></caption><table id="table3" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Null hypothesis</td><td align="left" valign="bottom">Alternative</td><td align="left" valign="bottom" colspan="2">AUC<sup><xref ref-type="table-fn" rid="table3fn1">a</xref></sup>-based score</td><td align="left" valign="bottom" colspan="2">ICI<sup><xref ref-type="table-fn" rid="table3fn2">b</xref></sup>-based score</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">Mean difference</td><td align="left" valign="bottom">Corrected <italic>P</italic> value</td><td align="left" valign="bottom">Mean difference</td><td align="left" valign="bottom">Corrected <italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">TabPFN<sup><xref ref-type="table-fn" rid="table3fn3">c</xref></sup> with pretrained augmentation and non-pretrained augmentation performs similarly (n<sub>0</sub>=50)</td><td align="left" valign="top">Pretrained augmentation performs better (n<sub>0</sub>=50)</td><td align="left" valign="top">0.0024</td><td align="left" valign="top">.22</td><td align="left" valign="top">&#x2212;0.0063</td><td align="left" valign="top">.97</td></tr><tr><td align="left" valign="top">LGBM<sup><xref 
ref-type="table-fn" rid="table3fn4">d</xref></sup> with pretrained augmentation and non-pretrained augmentation performs similarly (n<sub>0</sub>=50)</td><td align="left" valign="top">Pretrained augmentation performs better (n<sub>0</sub>=50)</td><td align="left" valign="top">0.0206<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="top">.02<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="top">&#x2212;0.0279</td><td align="left" valign="top">.97</td></tr><tr><td align="left" valign="top">TabPFN with pretrained augmentation and non-pretrained augmentation performs similarly (n<sub>0</sub>=350)</td><td align="left" valign="top">Pretrained augmentation performs better (n<sub>0</sub>=350)</td><td align="left" valign="top">0.0003</td><td align="left" valign="top">.41</td><td align="left" valign="top">0.0018</td><td align="left" valign="top">.57</td></tr><tr><td align="left" valign="top">LGBM with pretrained augmentation and non-pretrained augmentation performs similarly (n<sub>0</sub>=350)</td><td align="left" valign="top">Pretrained augmentation performs better (n<sub>0</sub>=350)</td><td align="left" valign="top">0.0106<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="top">.02<sup><xref ref-type="table-fn" rid="table3fn5">e</xref></sup></td><td align="left" valign="top">&#x2212;0.0057</td><td align="left" valign="top">.99</td></tr></tbody></table><table-wrap-foot><fn id="table3fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn><fn id="table3fn2"><p><sup>b</sup>ICI: integrated calibration index.</p></fn><fn id="table3fn3"><p><sup>c</sup>TabPFN: Tabular Prior-Data Fitted Network.</p></fn><fn id="table3fn4"><p><sup>d</sup>LGBM: light gradient boosting machine.</p></fn><fn id="table3fn5"><p><sup>e</sup>Values significant at an <italic>&#x03B1;</italic> level of .05.</p></fn></table-wrap-foot></table-wrap><p>For TabPFN, pretrained augmentation did 
not lead to meaningful changes in discrimination at either dataset size (n<sub>0</sub>=50: &#x0394;AUC=0.0024, <italic>P</italic>=.22; n<sub>0</sub>=350: &#x0394;AUC=0.0003, <italic>P</italic>=.41).</p><p>In contrast, LGBM showed consistent and statistically significant improvements in AUC with pretrained augmentation (n<sub>0</sub>=50: &#x0394;AUC=0.0206, median AUC=0.67, IQR 0.66-0.69, <italic>P</italic>=.02; n<sub>0</sub>=350: &#x0394;AUC=0.0106, median AUC=0.73, IQR 0.71-0.75, <italic>P</italic>=.02), corresponding to gains of approximately 1 to 2 percentage points. These improvements were consistent in direction across 12 of the 13 datasets.</p><p>For calibration, pretrained augmentation was associated with small reductions in ICI for both models at n<sub>0</sub>=50, but these effects were modest and not statistically significant after correction for multiple testing. At n<sub>0</sub>=350, calibration differences between pretrained and non-pretrained augmentation were negligible.</p></sec><sec id="s3-2"><title>Q2: Impact of Augmentation vs No Augmentation</title><p>We evaluated the effect of data augmentation compared to no augmentation across 13 datasets using paired permutation tests with Holm-Bonferroni correction (<xref ref-type="table" rid="table4">Table 4</xref>). Based on Q1, pretrained SDG methods were used for LGBM, while all augmentation strategies were retained for TabPFN.</p><table-wrap id="t4" position="float"><label>Table 4.</label><caption><p>Pairwise permutation tests to determine the impact of augmentation against no augmentation. 
<italic>P</italic> values were adjusted for multiple comparisons using the Holm-Bonferroni procedure.</p></caption><table id="table4" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Null hypothesis</td><td align="left" valign="bottom">Alternative</td><td align="left" valign="bottom" colspan="2">AUC<sup><xref ref-type="table-fn" rid="table4fn1">a</xref></sup>-based score</td><td align="left" valign="bottom" colspan="2">ICI<sup><xref ref-type="table-fn" rid="table4fn2">b</xref></sup>-based score</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">Mean difference</td><td align="left" valign="bottom">Corrected <italic>P</italic> value</td><td align="left" valign="bottom">Mean difference</td><td align="left" valign="bottom">Corrected <italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">TabPFN<sup><xref ref-type="table-fn" rid="table4fn3">c</xref></sup> with and without augmentation performs the same (n<sub>0</sub>=50)</td><td align="left" valign="top">Augmented TabPFN performs better (n<sub>0</sub>=50)</td><td align="left" valign="top">0.0094</td><td align="left" valign="top">.10</td><td align="left" valign="top">0.0134<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top">.04<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr><tr><td align="left" valign="top">LLM<sup><xref ref-type="table-fn" rid="table4fn5">e</xref></sup>-augmented LGBM<sup><xref ref-type="table-fn" rid="table4fn6">f</xref></sup> performs the same as LGBM without augmentation (n<sub>0</sub>=50)</td><td align="left" valign="top">LLM-augmented LGBM performs better (n<sub>0</sub>=50)</td><td align="left" valign="top">0.0764<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" 
valign="top">0.0191<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top">.003<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td></tr><tr><td align="left" valign="top">TabPFN with and without augmentation performs the same (n<sub>0</sub>=350)</td><td align="left" valign="top">Augmented TabPFN performs better (n<sub>0</sub>=350)</td><td align="left" valign="top">0.0011</td><td align="left" valign="top">.06</td><td align="left" valign="top">0.0041</td><td align="left" valign="top">.09</td></tr><tr><td align="left" valign="top">LLM-augmented LGBM performs the same as LGBM without augmentation (n<sub>0</sub>=350)</td><td align="left" valign="top">LLM-augmented LGBM performs better (n<sub>0</sub>=350)</td><td align="left" valign="top">0.0078<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top">.02<sup><xref ref-type="table-fn" rid="table4fn4">d</xref></sup></td><td align="left" valign="top">0.0119</td><td align="left" valign="top">.09</td></tr></tbody></table><table-wrap-foot><fn id="table4fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn><fn id="table4fn2"><p><sup>b</sup>ICI: integrated calibration index.</p></fn><fn id="table4fn3"><p><sup>c</sup>TabPFN: Tabular Prior-Data Fitted Network.</p></fn><fn id="table4fn4"><p><sup>d</sup>Values significant at an <italic>&#x03B1;</italic> level of .05.</p></fn><fn id="table4fn5"><p><sup>e</sup>LLM: large language model.</p></fn><fn id="table4fn6"><p><sup>f</sup>LGBM: light gradient boosting machine.</p></fn></table-wrap-foot></table-wrap><p>For TabPFN, augmentation did not significantly affect discrimination at either dataset size (n<sub>0</sub>=50: &#x0394;AUC=0.0094, <italic>P</italic>=.10; n<sub>0</sub>=350: &#x0394;AUC=0.0011, <italic>P</italic>=.06), indicating minimal impact on AUC.</p><p>For LGBM, LLM-based augmentation resulted in statistically significant improvements in discrimination at both n<sub>0</sub>=50 
(&#x0394;AUC=0.0764, <italic>P</italic>&#x003C;.001) and n<sub>0</sub>=350 (&#x0394;AUC=0.0078, <italic>P</italic>=.02), with larger gains observed in the smaller datasets.</p><p>For calibration, augmentation significantly reduced ICI at n<sub>0</sub>=50 for both TabPFN (&#x0394;ICI=0.0134, <italic>P</italic>=.04) and LGBM (&#x0394;ICI=0.0191, <italic>P</italic>=.003). At n<sub>0</sub>=350, calibration differences between augmented and nonaugmented models were small and not statistically significant.</p></sec><sec id="s3-3"><title>Q3: Comparison of Predictive Models</title><p>We compared model performance using the best configurations identified in Q1 and Q2 (<xref ref-type="table" rid="table5">Table 5</xref>).</p><table-wrap id="t5" position="float"><label>Table 5.</label><caption><p>Pairwise permutation tests to determine the best classifier. <italic>P</italic> values were adjusted for multiple comparisons using the Holm-Bonferroni procedure.</p></caption><table id="table5" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Null hypothesis</td><td align="left" valign="bottom">Alternative</td><td align="left" valign="bottom" colspan="2">AUC<sup><xref ref-type="table-fn" rid="table5fn1">a</xref></sup>-based score</td><td align="left" valign="bottom" colspan="2">ICI<sup><xref ref-type="table-fn" rid="table5fn2">b</xref></sup>-based score</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"/><td align="left" valign="bottom">Mean difference</td><td align="left" valign="bottom">Corrected <italic>P</italic> value</td><td align="left" valign="bottom">Mean difference</td><td align="left" valign="bottom">Corrected <italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">LLM<sup><xref ref-type="table-fn" rid="table5fn3">c</xref></sup>-augmented LGBM<sup><xref ref-type="table-fn" rid="table5fn4">d</xref></sup> and LLM classifier perform similarly (n<sub>0</sub>=50)</td><td align="left" 
valign="top">LLM-augmented LGBM performs better (n<sub>0</sub>=50)</td><td align="left" valign="top">0.0634<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">.04<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">0.0152</td><td align="left" valign="top">.31</td></tr><tr><td align="left" valign="top">Augmented TabPFN<sup><xref ref-type="table-fn" rid="table5fn6">f</xref></sup> and LLM classifier perform similarly (n<sub>0</sub>=50)</td><td align="left" valign="top">TabPFN performs better (n<sub>0</sub>=50)</td><td align="left" valign="top">0.0951<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">.03<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">0.0764<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">.03<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td></tr><tr><td align="left" valign="top">Augmented TabPFN and LLM-augmented LGBM perform similarly (n<sub>0</sub>=50)</td><td align="left" valign="top">TabPFN performs better (n<sub>0</sub>=50)</td><td align="left" valign="top">0.0317<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">.04<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">0.0612<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">&#x003C;.001<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td></tr><tr><td align="left" valign="top">LLM-augmented LGBM and LLM classifier perform similarly (n<sub>0</sub>=350)</td><td align="left" valign="top">LLM-augmented LGBM performs better (n<sub>0</sub>=350)</td><td align="left" valign="top">0.0288<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">.04<sup><xref ref-type="table-fn" 
rid="table5fn5">e</xref></sup></td><td align="left" valign="top">0.0383</td><td align="left" valign="top">.06</td></tr><tr><td align="left" valign="top">Augmented TabPFN and LLM classifier perform similarly (n<sub>0</sub>=350)</td><td align="left" valign="top">TabPFN performs better (n<sub>0</sub>=350)</td><td align="left" valign="top">0.0441<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">.01<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">0.0484</td><td align="left" valign="top">.08</td></tr><tr><td align="left" valign="top">Augmented TabPFN and LLM-augmented LGBM perform similarly (n<sub>0</sub>=350)</td><td align="left" valign="top">TabPFN performs better (n<sub>0</sub>=350)</td><td align="left" valign="top">0.0153<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">.004<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">0.0101<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td><td align="left" valign="top">.02<sup><xref ref-type="table-fn" rid="table5fn5">e</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table5fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn><fn id="table5fn2"><p><sup>b</sup>ICI: integrated calibration index.</p></fn><fn id="table5fn3"><p><sup>c</sup>LLM: large language model.</p></fn><fn id="table5fn4"><p><sup>d</sup>LGBM: light gradient boosting machine.</p></fn><fn id="table5fn5"><p><sup>e</sup>Values significant at an <italic>&#x03B1;</italic> level of .05.</p></fn><fn id="table5fn6"><p><sup>f</sup>TabPFN: Tabular Prior-Data Fitted Network.</p></fn></table-wrap-foot></table-wrap><p>In terms of discrimination, both LLM-augmented LGBM and augmented TabPFN significantly outperformed LLM classifiers at both dataset sizes (all <italic>P</italic>&#x003C;.05). 
Augmented TabPFN further achieved a significantly higher AUC than LLM-augmented LGBM (n<sub>0</sub>=50: &#x0394;AUC=0.0317, <italic>P</italic>=.04; n<sub>0</sub>=350: &#x0394;AUC=0.0153, <italic>P</italic>=.004), with a median AUC of 0.75 across datasets (IQR 0.74-0.77).</p><p>For calibration, augmented TabPFN showed significantly lower ICI than LGBM at both n<sub>0</sub>=50 (&#x0394;ICI=0.0612, <italic>P</italic>&#x003C;.001) and n<sub>0</sub>=350 (&#x0394;ICI=0.0101, <italic>P</italic>=.02). Compared to LLM classifiers, TabPFN also demonstrated better calibration at n<sub>0</sub>=50 (&#x0394;ICI=0.0764, <italic>P</italic>=.03), while differences at n<sub>0</sub>=350 were smaller and not statistically significant (<italic>P</italic>=.08).</p><p>LLM classifiers are, therefore, reported in a fixed (no augmentation) configuration, reflecting their feasible operating regime under current computational constraints.</p></sec><sec id="s3-4"><title>Post Hoc Analysis: Augmentation Strategies</title><p>To further investigate whether the observed benefits of augmentation on TabPFN were driven by increased sample size rather than the generation of synthetic data, we compared SDG methods with sampling with replacement.</p><p>We evaluated 2 comparisons: (1) LLM-based synthesis versus the best-performing augmentation method (SDG or LLM) and (2) sampling with replacement versus the best-performing augmentation method. 
The results are reported in <xref ref-type="table" rid="table6">Table 6</xref>.</p><table-wrap id="t6" position="float"><label>Table 6.</label><caption><p>Post hoc 2-sided pairwise permutation test to compare augmenting Tabular Prior-Data Fitted Network (TabPFN) by sampling with replacement or by synthetic data generation (SDG) methods.</p></caption><table id="table6" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Null hypothesis and n<sub>0</sub></td><td align="left" valign="bottom" colspan="2">AUC<sup><xref ref-type="table-fn" rid="table6fn1">a</xref></sup>-based score</td><td align="left" valign="bottom" colspan="2">ICI<sup><xref ref-type="table-fn" rid="table6fn2">b</xref></sup>-based score</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom">Statistic</td><td align="left" valign="bottom"><italic>P</italic> value</td><td align="left" valign="bottom">Statistic</td><td align="left" valign="bottom"><italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top" colspan="5">LLM<sup><xref ref-type="table-fn" rid="table6fn3">c</xref></sup>-based synthesis vs SDG methods and LLM-based synthesis</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>50</td><td align="left" valign="top">&#x2212;0.0024</td><td align="left" valign="top">.45</td><td align="left" valign="top">&#x2212;0.0063</td><td align="left" valign="top">.41</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>350</td><td align="left" valign="top">&#x2212;0.0003</td><td align="left" valign="top">.82</td><td align="left" valign="top">0.0018</td><td align="left" valign="top">.57</td></tr><tr><td align="left" valign="top" colspan="5">Sampling with replacement vs SDG methods and LLM-based synthesis</td></tr><tr><td align="left" valign="top"><named-content 
content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>50</td><td align="left" valign="top">&#x2212;0.0058</td><td align="left" valign="top">.31</td><td align="left" valign="top">0.0016</td><td align="left" valign="top">.74</td></tr><tr><td align="left" valign="top"><named-content content-type="indent">&#x00A0;&#x00A0;&#x00A0;&#x00A0;</named-content>350</td><td align="left" valign="top">&#x2212;0.0014</td><td align="left" valign="top">.41</td><td align="left" valign="top">&#x2212;0.0013</td><td align="left" valign="top">.82</td></tr></tbody></table><table-wrap-foot><fn id="table6fn1"><p><sup>a</sup>AUC: area under the curve.</p></fn><fn id="table6fn2"><p><sup>b</sup>ICI: integrated calibration index.</p></fn><fn id="table6fn3"><p><sup>c</sup>LLM: large language model.</p></fn></table-wrap-foot></table-wrap><p>No statistically significant differences were observed for either AUC or ICI at either dataset size (all <italic>P</italic>&#x003E;.3). Effect sizes were also negligible (|&#x0394;AUC|&#x003C;0.006; |&#x0394;ICI|&#x003C;0.007), reinforcing the absence of meaningful performance differences.</p></sec><sec id="s3-5"><title>Ranking of Methods</title><p>The pairwise permutation tests supported the following performance ordering in terms of ICI and AUC: TabPFN augmented with resampling &#x003E; LLM-augmented LGBM &#x003E; LLM classifiers. As a post hoc validation, we additionally applied the Page trend test, a nonparametric procedure designed to evaluate ordered alternatives. This analysis provided supporting evidence for a monotonic trend in classifier performance for both ICI and AUC (<xref ref-type="table" rid="table7">Table 7</xref>), consistent with the ranking obtained from the permutation tests. 
Together, these results provide convergent evidence that classifier performance follows the hypothesized order.</p><table-wrap id="t7" position="float"><label>Table 7.</label><caption><p>Post hoc Page trend test (<italic>L</italic> value) results to validate rank order of classifiers&#x2019; performance.<sup><xref ref-type="table-fn" rid="table7fn1">a</xref></sup><sup>,</sup><sup><xref ref-type="table-fn" rid="table7fn2">b</xref></sup></p></caption><table id="table7" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">n<sub>0</sub></td><td align="left" valign="bottom" colspan="2">AUC<sup><xref ref-type="table-fn" rid="table7fn3">c</xref></sup>-based score</td><td align="left" valign="bottom" colspan="2">ICI<sup><xref ref-type="table-fn" rid="table7fn4">d</xref></sup>-based score</td></tr><tr><td align="left" valign="bottom"/><td align="left" valign="bottom"><italic>L</italic></td><td align="left" valign="bottom">Corrected <italic>P</italic> value</td><td align="left" valign="bottom"><italic>L</italic></td><td align="left" valign="bottom">Corrected <italic>P</italic> value</td></tr></thead><tbody><tr><td align="left" valign="top">50</td><td align="left" valign="top">167</td><td align="left" valign="top">.02</td><td align="left" valign="top">166</td><td align="left" valign="top">.03</td></tr><tr><td align="left" valign="top">350</td><td align="left" valign="top">172</td><td align="left" valign="top">&#x003C;.001</td><td align="left" valign="top">165</td><td align="left" valign="top">.048</td></tr></tbody></table><table-wrap-foot><fn id="table7fn1"><p><sup>a</sup>Null hypothesis: Augmented Tabular Prior-Data Fitted Network (TabPFN) with sampling, large language model (LLM)-augmented light gradient boosting machine (LGBM), and LLM perform similarly.</p></fn><fn id="table7fn2"><p><sup>b</sup>Alternative: Augmented TabPFN with sampling&#x003E;LLM-augmented LGBM&#x003E;LLM.</p></fn><fn id="table7fn3"><p><sup>c</sup>AUC: area under the 
curve.</p></fn><fn id="table7fn4"><p><sup>d</sup>ICI: integrated calibration index.</p></fn></table-wrap-foot></table-wrap></sec><sec id="s3-6"><title>Computational Cost</title><p>Computational resource requirements constitute a critical point of differentiation between the approaches considered. LLMs typically require extensive optimization and fine-tuning, resulting in substantial computational overhead. In contrast, TabPFN operates in a zero-shot setting, thereby obviating the need for dataset-specific training and significantly reducing computational demands. This distinction is especially relevant in clinical research contexts, where access to high-performance computing resources may be limited. The comparison in <xref ref-type="table" rid="table8">Table 8</xref> illustrates the marked relative differences in training and inference time across methods.</p><table-wrap id="t8" position="float"><label>Table 8.</label><caption><p>Approximate training and inference time per dataset.</p></caption><table id="table8" frame="hsides" rules="groups"><thead><tr><td align="left" valign="bottom">Model</td><td align="left" valign="bottom">Time needed to train one model on a dataset of 1000 samples and infer on 10,000 samples</td></tr></thead><tbody><tr><td align="left" valign="top">TabPFN<sup><xref ref-type="table-fn" rid="table8fn1">a</xref></sup></td><td align="left" valign="top">6 seconds on a GPU<sup><xref ref-type="table-fn" rid="table8fn2">b</xref></sup></td></tr><tr><td align="left" valign="top">LLM<sup><xref ref-type="table-fn" rid="table8fn3">c</xref></sup> (average between the 3 considered LLMs)</td><td align="left" valign="top">12 hours on a GPU</td></tr><tr><td align="left" valign="top">LGBM<sup><xref ref-type="table-fn" rid="table8fn4">d</xref></sup></td><td align="left" valign="top">20 minutes on CPUs<sup><xref ref-type="table-fn" rid="table8fn5">e</xref></sup></td></tr></tbody></table><table-wrap-foot><fn id="table8fn1"><p><sup>a</sup>TabPFN: Tabular 
Prior-Data Fitted Network.</p></fn><fn id="table8fn2"><p><sup>b</sup>GPU: graphics processing unit.</p></fn><fn id="table8fn3"><p><sup>c</sup>LLM: large language model.</p></fn><fn id="table8fn4"><p><sup>d</sup>LGBM: light gradient boosting machine.</p></fn><fn id="table8fn5"><p><sup>e</sup>CPU: central processing unit.</p></fn></table-wrap-foot></table-wrap></sec></sec><sec id="s4" sec-type="discussion"><title>Discussion</title><sec id="s4-1"><title>Summary</title><p>Health datasets available for research are often small, limiting the development of robust and generalizable clinical prediction models. Data augmentation is commonly used to mitigate this issue, including SDG methods such as CTGAN, Bayesian network, TVAE, and sequential trees [<xref ref-type="bibr" rid="ref8">8</xref>,<xref ref-type="bibr" rid="ref61">61</xref>,<xref ref-type="bibr" rid="ref62">62</xref>]. However, these approaches can overfit when trained on limited data. An alternative is to leverage pretrained models, such as LLMs, to generate synthetic data without relying solely on the small dataset at hand.</p><p>In this study, we evaluated pretrained SDG using LLMs alongside traditional SDG approaches across 13 health datasets of sizes 50 and 350. This choice is consistent with current clinical prediction practices, where median training dataset sizes can be as low as 20 with a median dataset size of around 600 records [<xref ref-type="bibr" rid="ref3">3</xref>,<xref ref-type="bibr" rid="ref9">9</xref>]. We assessed their impact on 2 classifiers (LGBM and TabPFN), focusing on both discrimination and calibration. We also compared these approaches to using LLMs directly as classifiers.</p></sec><sec id="s4-2"><title>Key Findings and Practical Interpretation</title><sec id="s4-2-1"><title>Augmentation Benefits Depend on the Model</title><p>Augmentation improved LGBM performance, particularly for very small datasets, with LLM-based augmentation yielding the largest gains. 
In contrast, TabPFN showed little to no improvement in discrimination from augmentation (<xref ref-type="table" rid="table4">Table 4</xref>). Additionally, pretrained augmentation provided measurable gains for LGBM but not for TabPFN (<xref ref-type="table" rid="table3">Table 3</xref>), suggesting that the pretraining of TabPFN already confers robustness in low-data settings.</p><p>For practitioners working with limited data, TabPFN can be used effectively with minimal augmentation effort, while LGBM may benefit from targeted augmentation when small performance gains are meaningful. In scenarios where large external test sets are unavailable, selecting the augmentation size through 10-fold cross-validation provides stable performance estimates (Figure C1 in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>), supporting its use as a practical model selection strategy in small-data settings.</p></sec><sec id="s4-2-2"><title>Simple Methods Can Be as Effective as Complex Ones</title><p>For TabPFN, neither SDG methods nor LLM-based augmentation outperformed simple sampling with replacement (<xref ref-type="table" rid="table6">Table 6</xref>). This indicates that the primary benefit of augmentation is increasing the effective sample size rather than generating novel synthetic patterns.</p><p>In resource-limited or regulated health care environments, simple resampling should be the default strategy. It is computationally trivial, fully transparent, and avoids the risks associated with black-box generative models.</p></sec><sec id="s4-2-3"><title>LLMs Are Not Reliable as Standalone Classifiers in Low-Data Settings</title><p>Across all experiments, LLMs used directly as classifiers performed worse than both TabPFN and LGBM, particularly in calibration (<xref ref-type="table" rid="table5">Table 5</xref>), with TabPFN achieving the best overall performance. 
Ranking analysis further supported this ordering (<xref ref-type="table" rid="table7">Table 7</xref>). Poor calibration is especially problematic in clinical contexts where probability estimates inform decisions.</p><p>Deploying LLMs as predictive models in small clinical datasets is not advisable. Their use is better suited for augmentation or other supportive roles rather than direct prediction.</p></sec><sec id="s4-2-4"><title>Calibration Improvements Are Limited but Important</title><p>Calibration improvements were observed primarily in the smallest datasets (<xref ref-type="table" rid="table4">Table 4</xref>), with no significant effects observed at larger sample sizes.</p><p>In small datasets, augmentation can still be valuable for improving the reliability of predicted probabilities, even when discrimination gains are modest.</p></sec><sec id="s4-2-5"><title>Model Choice Matters More as Data Increases</title><p>At n<sub>0</sub>=350, augmentation effects were smaller, and differences between models became more important (<xref ref-type="table" rid="table4">Table 4</xref>).</p><p>When moderate data is available, selecting an appropriate model (eg, TabPFN vs LGBM) is more impactful than investing in complex augmentation strategies.</p></sec><sec id="s4-2-6"><title>Trustworthy and Resource-Constrained AI Considerations</title><p>A central finding of this study is that complex SDG does not outperform simple, transparent alternatives in small-data clinical settings. The absence of measurable gains from SDG or LLM-based augmentation over resampling (<xref ref-type="table" rid="table6">Table 6</xref>) highlights that an increased sample size is the main driver of performance improvements. 
This has direct implications for trustworthy AI:</p><list list-type="bullet"><list-item><p xml:lang="en-ca">Transparency: Sampling with replacement provides a clear and traceable data generation process, unlike LLMs or SDG methods.</p></list-item><list-item><p xml:lang="en-ca">Safety: Avoiding black-box generation reduces the risk of introducing unrealistic or misleading synthetic patient data.</p></list-item><list-item><p xml:lang="en-ca">Regulatory alignment: Simpler methods are easier to justify and validate in clinical and regulatory contexts.</p></list-item><list-item><p xml:lang="en-ca">Efficiency: TabPFN requires seconds to run, compared to hours for LLMs, making it accessible in low-resource environments.</p></list-item></list><p>Similarly, TabPFN combines strong predictive performance (<xref ref-type="table" rid="table5">Table 5</xref>) with minimal computational requirements (<xref ref-type="table" rid="table8">Table 8</xref>), making it particularly suitable for environments with limited resources.</p><p>Although LLM classifiers were not evaluated under augmentation due to practical computational constraints, this reflects a realistic deployment limitation rather than an experimental omission. Unlike TabPFN and LGBM, LLM fine-tuning with varying synthetic sample sizes is computationally prohibitive at scale, requiring hundreds of graphics processing unit (GPU)&#x2013;days (see Section D in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref>). 
As such, the comparison reflects practical usability under resource-constrained conditions rather than fully symmetric experimental tuning.</p></sec><sec id="s4-2-7"><title>Fidelity and Role of Synthetic Data</title><p>The post hoc analysis suggests that improvements from augmentation are primarily driven by increased sample size rather than the generation of novel synthetic data (<xref ref-type="table" rid="table6">Table 6</xref>).</p><p>While we performed basic realism checks to ensure that synthetic records respect plausible clinical ranges and preserve key distributions, perfect fidelity to the original data is not required for predictive tasks. Prior work [<xref ref-type="bibr" rid="ref63">63</xref>] has shown that out-of-population observations can be common in synthetic datasets without necessarily degrading predictive performance. Similarly, generative models may improve generalization by increasing data diversity [<xref ref-type="bibr" rid="ref8">8</xref>].</p><p>Taken together, these findings indicate that the benefits of augmentation are more closely related to increased sample size and diversity than to the exact replication of the original data distribution.</p></sec></sec><sec id="s4-3"><title>Limitations and Future Work</title><p>This study focused on binary classification tasks and may not generalize to other settings, such as regression or survival analysis. Although we evaluated multiple datasets, further validation across diverse clinical contexts is needed.</p><p>While we evaluated 13 heterogeneous health datasets, they may not represent the full diversity of clinical data environments. Datasets with high dimensionality, extreme class imbalance, or strong temporal structure may behave differently under augmentation strategies. 
External validation on additional real-world datasets is required to strengthen generalizability.</p><p>Our experimental design relied on relatively large hold-out test sets to ensure stable estimation of discrimination and calibration metrics. In many real-world health care settings, such large external test sets are unavailable. Although our cross-validation analysis suggests that model selection through 10-fold cross-validation provides comparable estimates, performance variability may be higher in practice, particularly in ultra-small datasets.</p><p>Some of the non-pretrained models that were used in our analysis, such as CTGAN and TVAE, used the default hyperparameters and did not undergo additional tuning. While this may have limited their performance, sensitivity analyses presented in Section B in <xref ref-type="supplementary-material" rid="app1">Multimedia Appendix 1</xref> revealed that attempting to tune these hyperparameters on very small datasets can lead to overfitting. Specifically, we observed that the multivariate Hellinger distance (which is a common synthetic data fidelity metric) between the data from the generative models and sampling with replacement was lower when tuning compared to using default parameters. The sampling with replacement dataset serves as a baseline, indicating a high rate of overfitting. The fact that the fidelity was higher to the overfitted data with hyperparameter tuning indicates that tuning results in datasets that are closer to resampled data, which is highly overfitted. 
This highlights a practical limitation: in low-data settings, more complex model tuning may be counterproductive, and default configurations&#x2014;or simpler augmentation strategies&#x2014;can yield more reliable and stable results.</p><p>To adapt pretrained LLMs for tabular data, we used a serialization method from previous studies [<xref ref-type="bibr" rid="ref33">33</xref>,<xref ref-type="bibr" rid="ref35">35</xref>] that places column names before values, thereby preserving contextual information. While this creates artificial sentences, fine-tuning is thought to mitigate this issue. We did not explore alternative serialization methods, which could be an area for future research. Future work should explore broader classes of models, alternative data modalities, and evaluation in real-world deployment settings.</p></sec></sec></body><back><ack><p>Generative artificial intelligence was not used in the writing of this paper.</p></ack><notes><sec><title>Funding</title><p>This research is funded by the Canada Research Chairs program through the Canadian Institutes of Health Research, a Discovery Grant RGPIN-2022-04811 from the Natural Sciences and Engineering Research Council of Canada, and the Canadian Children's Inflammatory Bowel Disease Network. LP is funded by the Deutsche Forschungsgemeinschaft (DFG, German Research Foundation): 530282197.</p></sec><sec><title>Data Availability</title><p>The following provides information on the availability of each of the datasets used in this study:</p><p>1. Better Outcomes Registry &#x0026; Network (BORN): The BORN collects Ontario&#x2019;s prescribed perinatal, newborn, and child registry with the role of facilitating quality care for families across the province [<xref ref-type="bibr" rid="ref64">64</xref>].</p><p>2. Basic Stand Alone (BSA): The BSA inpatient claims dataset contains claim-level information, where each record is an inpatient claim incurred by a 5% sample of Medicare beneficiaries
[<xref ref-type="bibr" rid="ref65">65</xref>].</p><p>3. California State Hospital Discharge: The California dataset contains the patient hospital discharge data from 2008, sourced from the California State Inpatient Databases (SID), Healthcare Cost and Utilization Project (HCUP), and the Agency for Healthcare Research and Quality [<xref ref-type="bibr" rid="ref66">66</xref>].</p><p>4. Canadian Community Health Survey (CCHS): The CCHS data consist of Canadian population-level information concerning health status, health system utilization, and health determinants, collected by Statistics Canada through telephone surveys [<xref ref-type="bibr" rid="ref67">67</xref>].</p><p>5. COVID-19: The COVID-19 dataset collects Canadian health records related to COVID-19, gathered by the Public Health Agency of Canada, and is available on Esri Canada's platform [<xref ref-type="bibr" rid="ref68">68</xref>].</p><p>6. FDA Adverse Event Reporting System (FAERS): The FAERS is a database comprising information on adverse events and medication error reports submitted to the FDA and can be downloaded [<xref ref-type="bibr" rid="ref69">69</xref>].</p><p>7. Florida State Hospital Discharge: The Florida dataset contains the patient hospital discharge data from 2007, sourced from the Florida SID, HCUP, and the Agency for Healthcare Research and Quality [<xref ref-type="bibr" rid="ref66">66</xref>], and is available for purchase.</p><p>8. Medical Information Mart for Intensive Care III (MIMIC-III): MIMIC-III is a large database that contains deidentified health-related data associated with more than 40,000 patients who stayed in critical care units of the Beth Israel Deaconess Medical Center between 2001 and 2012 [<xref ref-type="bibr" rid="ref70">70</xref>,<xref ref-type="bibr" rid="ref71">71</xref>]. Access to the MIMIC database is granted upon signing a data use agreement with PhysioNet [<xref ref-type="bibr" rid="ref70">70</xref>-<xref ref-type="bibr" rid="ref72">72</xref>].</p><p>9. 
New York State Hospital Discharge: The New York dataset contains the patient hospital discharge data from 2007, sourced from the New York SID, HCUP, and the Agency for Healthcare Research and Quality [<xref ref-type="bibr" rid="ref66">66</xref>].</p><p>10. COVID-19 Survival (Nexoid): The COVID-19 survival dataset is a web-based survey dataset collected by a company called Nexoid in the United Kingdom. It is publicly available [<xref ref-type="bibr" rid="ref73">73</xref>].</p><p>11. Texas Hospital Discharge: The Texas dataset contains the patient hospital discharge information for the first quarter of 2012 from Texas in the United States [<xref ref-type="bibr" rid="ref74">74</xref>] and is publicly available.</p><p>12. Washington State Hospital Discharge 2007: The Washington dataset contains the patient hospital discharge data from 2007, sourced from the Washington SID, HCUP, and the Agency for Healthcare Research and Quality [<xref ref-type="bibr" rid="ref66">66</xref>]. It is available for purchase.</p><p>13. Washington State Hospital Discharge 2008: The Washington 2008 dataset contains the patient hospital discharge data from 2008, sourced from the Washington SID, HCUP, and the Agency for Healthcare Research and Quality [<xref ref-type="bibr" rid="ref66">66</xref>]. It is available for purchase.</p></sec></notes><fn-group><fn fn-type="conflict"><p>LP has financial interests in Woodway Assurance, a privacy technology spin-off company from her research lab at the University of Ottawa, but it is not related to the topic of this study. KEE is co&#x2013;editor-in-chief of <italic>JMIR AI</italic>. 
FKD is an associate editor of <italic>JMIR AI</italic>.</p></fn></fn-group><glossary><title>Abbreviations</title><def-list><def-item><term id="abb1">AUC</term><def><p>area under the curve</p></def></def-item><def-item><term id="abb2">CTGAN</term><def><p>conditional tabular generative adversarial network</p></def></def-item><def-item><term id="abb3">GPU</term><def><p>graphics processing unit</p></def></def-item><def-item><term id="abb4">ICI</term><def><p>integrated calibration index</p></def></def-item><def-item><term id="abb5">LGBM</term><def><p>light gradient boosting machine</p></def></def-item><def-item><term id="abb6">LLM</term><def><p>large language model</p></def></def-item><def-item><term id="abb7">LoRA</term><def><p>low-rank adaptation</p></def></def-item><def-item><term id="abb8">SDG</term><def><p>synthetic data generation</p></def></def-item><def-item><term id="abb9">TabPFN</term><def><p>Tabular Prior-Data Fitted Network</p></def></def-item><def-item><term id="abb10">TVAE</term><def><p>tabular variational autoencoder</p></def></def-item></def-list></glossary><ref-list><title>References</title><ref id="ref1"><label>1</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>van der Ploeg</surname><given-names>T</given-names> </name><name name-style="western"><surname>Austin</surname><given-names>PC</given-names> </name><name name-style="western"><surname>Steyerberg</surname><given-names>EW</given-names> </name></person-group><article-title>Modern modelling techniques are data hungry: a simulation study for predicting dichotomous endpoints</article-title><source>BMC Med Res Methodol</source><year>2014</year><month>12</month><day>22</day><volume>14</volume><fpage>137</fpage><pub-id pub-id-type="doi">10.1186/1471-2288-14-137</pub-id><pub-id pub-id-type="medline">25532820</pub-id></nlm-citation></ref><ref id="ref2"><label>2</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Riley</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Collins</surname><given-names>GS</given-names> </name></person-group><article-title>Stability of clinical prediction models developed using statistical or machine learning methods</article-title><source>Biom J</source><year>2023</year><month>12</month><volume>65</volume><issue>8</issue><fpage>e2200302</fpage><pub-id pub-id-type="doi">10.1002/bimj.202200302</pub-id><pub-id pub-id-type="medline">37466257</pub-id></nlm-citation></ref><ref id="ref3"><label>3</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Andaur Navarro</surname><given-names>CL</given-names> </name><name name-style="western"><surname>Damen</surname><given-names>JAA</given-names> </name><name name-style="western"><surname>van Smeden</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Systematic review identifies the design and methodological conduct of studies on machine learning-based prediction models</article-title><source>J Clin Epidemiol</source><year>2023</year><month>02</month><volume>154</volume><fpage>8</fpage><lpage>22</lpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2022.11.015</pub-id><pub-id pub-id-type="medline">36436815</pub-id></nlm-citation></ref><ref id="ref4"><label>4</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Riley</surname><given-names>RD</given-names> </name><name name-style="western"><surname>Ensor</surname><given-names>J</given-names> </name><name name-style="western"><surname>Snell</surname><given-names>KIE</given-names> </name><etal/></person-group><article-title>Importance of sample size on the quality and utility of AI-based prediction models for healthcare</article-title><source>Lancet Digit 
Health</source><year>2025</year><month>06</month><volume>7</volume><issue>6</issue><fpage>100857</fpage><pub-id pub-id-type="doi">10.1016/j.landig.2025.01.013</pub-id><pub-id pub-id-type="medline">40461350</pub-id></nlm-citation></ref><ref id="ref5"><label>5</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pongsuwun</surname><given-names>K</given-names> </name><name name-style="western"><surname>Puwarawuttipanit</surname><given-names>W</given-names> </name><name name-style="western"><surname>Nguantad</surname><given-names>S</given-names> </name><etal/></person-group><article-title>A systematic review of the accuracy of machine learning models for diagnosing pulmonary tuberculosis: implications for nursing practice and implementation</article-title><source>Nurs Health Sci</source><year>2025</year><month>03</month><volume>27</volume><issue>1</issue><fpage>e70077</fpage><pub-id pub-id-type="doi">10.1111/nhs.70077</pub-id><pub-id pub-id-type="medline">40058367</pub-id></nlm-citation></ref><ref id="ref6"><label>6</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Tsegaye</surname><given-names>B</given-names> </name><name name-style="western"><surname>Snell</surname><given-names>KIE</given-names> </name><name name-style="western"><surname>Archer</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Larger sample sizes are needed when developing a clinical prediction model using machine learning in oncology: methodological systematic review</article-title><source>J Clin Epidemiol</source><year>2025</year><month>04</month><volume>180</volume><fpage>111675</fpage><pub-id pub-id-type="doi">10.1016/j.jclinepi.2025.111675</pub-id><pub-id pub-id-type="medline">39814217</pub-id></nlm-citation></ref><ref id="ref7"><label>7</label><nlm-citation citation-type="journal"><person-group 
person-group-type="author"><name name-style="western"><surname>Mitsakakis</surname><given-names>N</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Walters</surname><given-names>T</given-names> </name><name name-style="western"><surname>El Emam</surname><given-names>K</given-names> </name></person-group><article-title>Sample size calculation for training ensemble machine learning models on health data</article-title><source>Patterns</source><year>2026</year><month>03</month><fpage>101498</fpage><pub-id pub-id-type="doi">10.1016/j.patter.2026.101498</pub-id></nlm-citation></ref><ref id="ref8"><label>8</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Liu</surname><given-names>D</given-names> </name><name name-style="western"><surname>Kababji</surname><given-names>SE</given-names> </name><name name-style="western"><surname>Mitsakakis</surname><given-names>N</given-names> </name><etal/></person-group><article-title>Augmenting small tabular health data for training prognostic ensemble machine learning models using generative models</article-title><source>BMC Med Inform Decis Mak</source><year>2025</year><month>11</month><day>28</day><volume>25</volume><issue>1</issue><fpage>435</fpage><pub-id pub-id-type="doi">10.1186/s12911-025-03266-3</pub-id><pub-id pub-id-type="medline">41316099</pub-id></nlm-citation></ref><ref id="ref9"><label>9</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Dhiman</surname><given-names>P</given-names> </name><name name-style="western"><surname>Ma</surname><given-names>J</given-names> </name><name name-style="western"><surname>Andaur Navarro</surname><given-names>CL</given-names> </name><etal/></person-group><article-title>Methodological conduct of prognostic prediction models developed using machine learning in 
oncology: a systematic review</article-title><source>BMC Med Res Methodol</source><year>2022</year><month>04</month><day>8</day><volume>22</volume><issue>1</issue><fpage>101</fpage><pub-id pub-id-type="doi">10.1186/s12874-022-01577-x</pub-id><pub-id pub-id-type="medline">35395724</pub-id></nlm-citation></ref><ref id="ref10"><label>10</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Mumuni</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mumuni</surname><given-names>F</given-names> </name></person-group><article-title>Data augmentation: a comprehensive survey of modern approaches</article-title><source>Array</source><year>2022</year><month>12</month><volume>16</volume><fpage>100258</fpage><pub-id pub-id-type="doi">10.1016/j.array.2022.100258</pub-id></nlm-citation></ref><ref id="ref11"><label>11</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shorten</surname><given-names>C</given-names> </name><name name-style="western"><surname>Khoshgoftaar</surname><given-names>TM</given-names> </name></person-group><article-title>A survey on image data augmentation for deep learning</article-title><source>J Big Data</source><year>2019</year><month>12</month><volume>6</volume><issue>1</issue><fpage>60</fpage><pub-id pub-id-type="doi">10.1186/s40537-019-0197-0</pub-id></nlm-citation></ref><ref id="ref12"><label>12</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goceri</surname><given-names>E</given-names> </name></person-group><article-title>Medical image data augmentation: techniques, comparisons and interpretations</article-title><source>Artif Intell Rev</source><year>2023</year><month>03</month><day>20</day><volume>56</volume><fpage>12561</fpage><lpage>12605</lpage><pub-id 
pub-id-type="doi">10.1007/s10462-023-10453-z</pub-id><pub-id pub-id-type="medline">37362888</pub-id></nlm-citation></ref><ref id="ref13"><label>13</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Naveed</surname><given-names>H</given-names> </name><name name-style="western"><surname>Anwar</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hayat</surname><given-names>M</given-names> </name><name name-style="western"><surname>Javed</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mian</surname><given-names>A</given-names> </name></person-group><article-title>Survey: image mixing and deleting for data augmentation</article-title><source>Eng Appl Artif Intell</source><year>2024</year><month>05</month><volume>131</volume><fpage>107791</fpage><pub-id pub-id-type="doi">10.1016/j.engappai.2023.107791</pub-id></nlm-citation></ref><ref id="ref14"><label>14</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Feng</surname><given-names>S</given-names> </name><name name-style="western"><surname>Gangal</surname><given-names>V</given-names> </name><name name-style="western"><surname>Wei</surname><given-names>J</given-names> </name><etal/></person-group><article-title>A survey of data augmentation approaches for NLP</article-title><conf-name>Findings of the Association for Computational Linguistics</conf-name><conf-date>Aug 1-6, 2021</conf-date><pub-id pub-id-type="doi">10.18653/v1/2021.findings-acl.84</pub-id></nlm-citation></ref><ref id="ref15"><label>15</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Chlap</surname><given-names>P</given-names> </name><name name-style="western"><surname>Min</surname><given-names>H</given-names> </name><name 
name-style="western"><surname>Vandenberg</surname><given-names>N</given-names> </name><name name-style="western"><surname>Dowling</surname><given-names>J</given-names> </name><name name-style="western"><surname>Holloway</surname><given-names>L</given-names> </name><name name-style="western"><surname>Haworth</surname><given-names>A</given-names> </name></person-group><article-title>A review of medical image data augmentation techniques for deep learning applications</article-title><source>J Med Imaging Radiat Oncol</source><year>2021</year><month>08</month><volume>65</volume><issue>5</issue><fpage>545</fpage><lpage>563</lpage><pub-id pub-id-type="doi">10.1111/1754-9485.13261</pub-id><pub-id pub-id-type="medline">34145766</pub-id></nlm-citation></ref><ref id="ref16"><label>16</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Duong</surname><given-names>HT</given-names> </name><name name-style="western"><surname>Nguyen-Thi</surname><given-names>TA</given-names> </name></person-group><article-title>A review: preprocessing techniques and data augmentation for sentiment analysis</article-title><source>Comput Soc Netw</source><year>2021</year><month>12</month><volume>8</volume><issue>1</issue><fpage>1</fpage><pub-id pub-id-type="doi">10.1186/s40649-020-00080-x</pub-id></nlm-citation></ref><ref id="ref17"><label>17</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Felix</surname><given-names>EA</given-names> </name><name name-style="western"><surname>Lee</surname><given-names>SP</given-names> </name></person-group><article-title>Systematic literature review of preprocessing techniques for imbalanced data</article-title><source>IET Softw</source><year>2019</year><month>12</month><volume>13</volume><issue>6</issue><fpage>479</fpage><lpage>496</lpage><pub-id pub-id-type="doi">10.1049/iet-sen.2018.5193</pub-id></nlm-citation></ref><ref 
id="ref18"><label>18</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Wen</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Sun</surname><given-names>L</given-names> </name><name name-style="western"><surname>Yang</surname><given-names>F</given-names> </name><etal/></person-group><article-title>Time series data augmentation for deep learning: a survey</article-title><conf-name>Thirtieth International Joint Conference on Artificial Intelligence (IJCAI-21)</conf-name><conf-date>Aug 19-26, 2021</conf-date><pub-id pub-id-type="doi">10.24963/ijcai.2021/631</pub-id></nlm-citation></ref><ref id="ref19"><label>19</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Iwana</surname><given-names>BK</given-names> </name><name name-style="western"><surname>Uchida</surname><given-names>S</given-names> </name></person-group><article-title>An empirical survey of data augmentation for time series classification with neural networks</article-title><source>PLOS ONE</source><year>2021</year><volume>16</volume><issue>7</issue><fpage>e0254841</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0254841</pub-id><pub-id pub-id-type="medline">34264999</pub-id></nlm-citation></ref><ref id="ref20"><label>20</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>L</given-names> </name><name name-style="western"><surname>Skoularidou</surname><given-names>M</given-names> </name><name name-style="western"><surname>Cuesta-Infante</surname><given-names>A</given-names> </name><name name-style="western"><surname>Veeramachaneni</surname><given-names>K</given-names> </name></person-group><article-title>Modeling tabular data using conditional GAN</article-title><access-date>2026-05-21</access-date><conf-name>33rd Conference on Neural 
Information Processing Systems (NeurIPS 2019)</conf-name><conf-date>Dec 8-14, 2019</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2019/file/254ed7d2de3b23ab10936522dd547b78-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2019/file/254ed7d2de3b23ab10936522dd547b78-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref21"><label>21</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Kiran</surname><given-names>A</given-names> </name><name name-style="western"><surname>Rubini</surname><given-names>P</given-names> </name><name name-style="western"><surname>Kumar</surname><given-names>SS</given-names> </name></person-group><article-title>Challenges and limitations of TVAE tabular synthetic data generator</article-title><source>Advanced Computing</source><year>2025</year><publisher-name>Springer</publisher-name><fpage>243</fpage><lpage>254</lpage><pub-id pub-id-type="doi">10.1007/978-3-031-84602-1_17</pub-id></nlm-citation></ref><ref id="ref22"><label>22</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Kaur</surname><given-names>D</given-names> </name><name name-style="western"><surname>Sobiesk</surname><given-names>M</given-names> </name><name name-style="western"><surname>Patil</surname><given-names>S</given-names> </name><etal/></person-group><article-title>Application of Bayesian networks to generate synthetic health data</article-title><source>J Am Med Inform Assoc</source><year>2021</year><month>03</month><day>18</day><volume>28</volume><issue>4</issue><fpage>801</fpage><lpage>811</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa303</pub-id><pub-id pub-id-type="medline">33367620</pub-id></nlm-citation></ref><ref id="ref23"><label>23</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name 
name-style="western"><surname>Gogoshin</surname><given-names>G</given-names> </name><name name-style="western"><surname>Branciamore</surname><given-names>S</given-names> </name><name name-style="western"><surname>Rodin</surname><given-names>AS</given-names> </name></person-group><article-title>Synthetic data generation with probabilistic Bayesian networks</article-title><source>Math Biosci Eng</source><year>2021</year><month>10</month><day>9</day><volume>18</volume><issue>6</issue><fpage>8603</fpage><lpage>8621</lpage><pub-id pub-id-type="doi">10.3934/mbe.2021426</pub-id><pub-id pub-id-type="medline">34814315</pub-id></nlm-citation></ref><ref id="ref24"><label>24</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Martins</surname><given-names>LNA</given-names> </name><name name-style="western"><surname>Gon&#x00E7;alves</surname><given-names>FB</given-names> </name><name name-style="western"><surname>Galletti</surname><given-names>TP</given-names> </name></person-group><article-title>Generation and analysis of synthetic data via Bayesian networks: a robust approach for uncertainty quantification via Bayesian paradigm</article-title><source>arXiv</source><comment>Preprint posted online on Feb 27, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.17915</pub-id></nlm-citation></ref><ref id="ref25"><label>25</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Deeva</surname><given-names>I</given-names> </name><name name-style="western"><surname>Andriushchenko</surname><given-names>PD</given-names> </name><name name-style="western"><surname>Kalyuzhnaya</surname><given-names>AV</given-names> </name><name name-style="western"><surname>Boukhanovsky</surname><given-names>AV</given-names> </name></person-group><article-title>Bayesian networks-based personal data synthesis</article-title><conf-name>Proceedings of the 6th EAI
International Conference on Smart Objects and Technologies for Social Good</conf-name><conf-date>Sep 14-16, 2020</conf-date><pub-id pub-id-type="doi">10.1145/3411170.3411243</pub-id></nlm-citation></ref><ref id="ref26"><label>26</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>El Emam</surname><given-names>K</given-names> </name><name name-style="western"><surname>Mosquera</surname><given-names>L</given-names> </name><name name-style="western"><surname>Zheng</surname><given-names>C</given-names> </name></person-group><article-title>Optimizing the synthesis of clinical trial data using sequential trees</article-title><source>J Am Med Inform Assoc</source><year>2021</year><month>01</month><day>15</day><volume>28</volume><issue>1</issue><fpage>3</fpage><lpage>13</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaa249</pub-id><pub-id pub-id-type="medline">33186440</pub-id></nlm-citation></ref><ref id="ref27"><label>27</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Drechsler</surname><given-names>J</given-names> </name><name name-style="western"><surname>Reiter</surname><given-names>JP</given-names> </name></person-group><article-title>An empirical evaluation of easily implemented, nonparametric methods for generating synthetic datasets</article-title><source>Comput Stat Data Anal</source><year>2011</year><month>12</month><volume>55</volume><issue>12</issue><fpage>3232</fpage><lpage>3243</lpage><pub-id pub-id-type="doi">10.1016/j.csda.2011.06.006</pub-id></nlm-citation></ref><ref id="ref28"><label>28</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Nowok</surname><given-names>B</given-names> </name></person-group><article-title>Utility of synthetic microdata generated using tree-based
methods</article-title><access-date>2026-05-21</access-date><conf-name>Joint UNECE/Eurostat Work Session on Statistical Data Confidentiality (Nowok, 2015)</conf-name><conf-date>Oct 5-7, 2015</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://unece.org/sites/default/files/datastore/fileadmin/DAM/stats/documents/ece/ces/ge.46/20150/Paper_33_Session_2_-_Univ._Edinburgh__Nowok_.pdf">https://unece.org/sites/default/files/datastore/fileadmin/DAM/stats/documents/ece/ces/ge.46/20150/Paper_33_Session_2_-_Univ._Edinburgh__Nowok_.pdf</ext-link></comment></nlm-citation></ref><ref id="ref29"><label>29</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Reiter</surname><given-names>J</given-names> </name></person-group><article-title>Using CART to generate partially synthetic, public use microdata</article-title><source>J Off Stat</source><year>2005</year><access-date>2026-05-21</access-date><volume>21</volume><issue>3</issue><fpage>441</fpage><lpage>462</lpage><comment><ext-link ext-link-type="uri" xlink:href="https://www.scb.se/contentassets/ca21efb41fee47d293bbee5bf7be7fb3/using-cart-to-generate-partially-synthetic-public-use-microdata.pdf">https://www.scb.se/contentassets/ca21efb41fee47d293bbee5bf7be7fb3/using-cart-to-generate-partially-synthetic-public-use-microdata.pdf</ext-link></comment></nlm-citation></ref><ref id="ref30"><label>30</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Barr</surname><given-names>AA</given-names> </name><name name-style="western"><surname>Quan</surname><given-names>J</given-names> </name><name name-style="western"><surname>Guo</surname><given-names>E</given-names> </name><name name-style="western"><surname>Sezgin</surname><given-names>E</given-names> </name></person-group><article-title>Large language models generating synthetic clinical datasets: a feasibility and comparative 
analysis with real-world perioperative data</article-title><source>Front Artif Intell</source><year>2025</year><volume>8</volume><fpage>1533508</fpage><pub-id pub-id-type="doi">10.3389/frai.2025.1533508</pub-id><pub-id pub-id-type="medline">39974356</pub-id></nlm-citation></ref><ref id="ref31"><label>31</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Fang</surname><given-names>X</given-names> </name><name name-style="western"><surname>Xu</surname><given-names>W</given-names> </name><name name-style="western"><surname>Tan</surname><given-names>FA</given-names> </name><etal/></person-group><article-title>Large language models (LLMs) on tabular data: prediction, generation, and understanding&#x2014;a survey</article-title><source>arXiv</source><comment>Preprint posted online on Feb 27, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2402.17944</pub-id></nlm-citation></ref><ref id="ref32"><label>32</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Borisov</surname><given-names>V</given-names> </name><name name-style="western"><surname>Se&#x00DF;ler</surname><given-names>K</given-names> </name><name name-style="western"><surname>Leemann</surname><given-names>T</given-names> </name><name name-style="western"><surname>Pawelczyk</surname><given-names>M</given-names> </name><name name-style="western"><surname>Kasneci</surname><given-names>G</given-names> </name></person-group><article-title>Language models are realistic tabular data generators</article-title><source>arXiv</source><comment>Preprint posted online on Oct 12, 2022</comment><pub-id pub-id-type="doi">10.48550/arXiv.2210.06280</pub-id></nlm-citation></ref><ref id="ref33"><label>33</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Seedat</surname><given-names>N</given-names> </name><name
name-style="western"><surname>Huynh</surname><given-names>N</given-names> </name><name name-style="western"><surname>van Breugel</surname><given-names>B</given-names> </name><etal/></person-group><article-title>Curated LLM: synergy of LLMs and data curation for tabular augmentation in low-data regimes</article-title><conf-name>International Conference on Machine Learning (ICML 2024)</conf-name><conf-date>Jul 21-27, 2024</conf-date><pub-id pub-id-type="doi">10.5555/3692070.3693865</pub-id></nlm-citation></ref><ref id="ref34"><label>34</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Isomura</surname><given-names>T</given-names> </name><name name-style="western"><surname>Shimizu</surname><given-names>R</given-names> </name><name name-style="western"><surname>Goto</surname><given-names>M</given-names> </name></person-group><article-title>LLMOverTab: tabular data augmentation with language model-driven oversampling</article-title><source>Expert Syst Appl</source><year>2025</year><month>03</month><volume>264</volume><fpage>125852</fpage><pub-id pub-id-type="doi">10.1016/j.eswa.2024.125852</pub-id></nlm-citation></ref><ref id="ref35"><label>35</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Nguyen</surname><given-names>D</given-names> </name><name name-style="western"><surname>Gupta</surname><given-names>S</given-names> </name><name name-style="western"><surname>Do</surname><given-names>K</given-names> </name><name name-style="western"><surname>Nguyen</surname><given-names>T</given-names> </name><name name-style="western"><surname>Venkatesh</surname><given-names>S</given-names> </name></person-group><article-title>Generating realistic tabular data with large language models</article-title><conf-name>2024 IEEE International Conference on Data Mining (ICDM)</conf-name><conf-date>Dec 9-12, 2024</conf-date><pub-id
pub-id-type="doi">10.1109/ICDM59182.2024.00040</pub-id></nlm-citation></ref><ref id="ref36"><label>36</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Brown</surname><given-names>KE</given-names> </name><name name-style="western"><surname>Yan</surname><given-names>C</given-names> </name><name name-style="western"><surname>Li</surname><given-names>Z</given-names> </name><etal/></person-group><article-title>Large language models are less effective at clinical prediction tasks than locally trained machine learning models</article-title><source>J Am Med Inform Assoc</source><year>2025</year><month>05</month><day>1</day><volume>32</volume><issue>5</issue><fpage>811</fpage><lpage>822</lpage><pub-id pub-id-type="doi">10.1093/jamia/ocaf038</pub-id><pub-id pub-id-type="medline">40056436</pub-id></nlm-citation></ref><ref id="ref37"><label>37</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Hollmann</surname><given-names>N</given-names> </name><name name-style="western"><surname>M&#x00FC;ller</surname><given-names>S</given-names> </name><name name-style="western"><surname>Purucker</surname><given-names>L</given-names> </name><etal/></person-group><article-title>Accurate predictions on small data with a tabular foundation model</article-title><source>Nature</source><year>2025</year><month>01</month><volume>637</volume><issue>8045</issue><fpage>319</fpage><lpage>326</lpage><pub-id pub-id-type="doi">10.1038/s41586-024-08328-6</pub-id><pub-id pub-id-type="medline">39780007</pub-id></nlm-citation></ref><ref id="ref38"><label>38</label><nlm-citation citation-type="thesis"><person-group person-group-type="author"><name name-style="western"><surname>Xu</surname><given-names>L</given-names> </name></person-group><article-title>Synthesizing tabular data using conditional GAN [Master&#x2019;s 
thesis]</article-title><year>2020</year><access-date>2026-05-21</access-date><publisher-name>Massachusetts Institute of Technology</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://dspace.mit.edu/entities/publication/79844ce9-4b05-4be9-acdc-568c9483c51d">https://dspace.mit.edu/entities/publication/79844ce9-4b05-4be9-acdc-568c9483c51d</ext-link></comment></nlm-citation></ref><ref id="ref39"><label>39</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Habibi</surname><given-names>O</given-names> </name><name name-style="western"><surname>Chemmakha</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lazaar</surname><given-names>M</given-names> </name></person-group><article-title>Imbalanced tabular data modelization using CTGAN and machine learning to improve IoT Botnet attacks detection</article-title><source>Eng Appl Artif Intell</source><year>2023</year><month>02</month><volume>118</volume><fpage>105669</fpage><pub-id pub-id-type="doi">10.1016/j.engappai.2022.105669</pub-id></nlm-citation></ref><ref id="ref40"><label>40</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bourou</surname><given-names>S</given-names> </name><name name-style="western"><surname>El Saer</surname><given-names>A</given-names> </name><name name-style="western"><surname>Velivassaki</surname><given-names>TH</given-names> </name><name name-style="western"><surname>Voulkidis</surname><given-names>A</given-names> </name><name name-style="western"><surname>Zahariadis</surname><given-names>T</given-names> </name></person-group><article-title>A review of tabular data synthesis using GANs on an IDS dataset</article-title><source>Information</source><year>2021</year><volume>12</volume><issue>9</issue><fpage>375</fpage><pub-id pub-id-type="doi">10.3390/info12090375</pub-id></nlm-citation></ref><ref 
id="ref41"><label>41</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pathare</surname><given-names>A</given-names> </name><name name-style="western"><surname>Mangrulkar</surname><given-names>R</given-names> </name><name name-style="western"><surname>Suvarna</surname><given-names>K</given-names> </name><name name-style="western"><surname>Parekh</surname><given-names>A</given-names> </name><name name-style="western"><surname>Thakur</surname><given-names>G</given-names> </name><name name-style="western"><surname>Gawade</surname><given-names>A</given-names> </name></person-group><article-title>Comparison of tabular synthetic data generation techniques using propensity and cluster log metric</article-title><source>Int J Inf Manag Data Insights</source><year>2023</year><month>11</month><volume>3</volume><issue>2</issue><fpage>100177</fpage><pub-id pub-id-type="doi">10.1016/j.jjimei.2023.100177</pub-id></nlm-citation></ref><ref id="ref42"><label>42</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Farhadyar</surname><given-names>K</given-names> </name><name name-style="western"><surname>Bonofiglio</surname><given-names>F</given-names> </name><name name-style="western"><surname>Zoeller</surname><given-names>D</given-names> </name><name name-style="western"><surname>Binder</surname><given-names>H</given-names></name></person-group><article-title>Adapting deep generative approaches for getting synthetic data with realistic marginal distributions</article-title><source>arXiv</source><comment>Preprint posted online on May 14, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2105.06907</pub-id></nlm-citation></ref><ref id="ref43"><label>43</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Qian</surname><given-names>Z</given-names> </name><name
name-style="western"><surname>Cebere</surname><given-names>BC</given-names> </name><name name-style="western"><surname>van der Schaar</surname><given-names>M</given-names> </name></person-group><article-title>Synthcity: facilitating innovative use cases of synthetic data in different data modalities</article-title><source>arXiv</source><comment>Preprint posted online on Jan 18, 2023</comment><pub-id pub-id-type="doi">10.48550/arXiv.2301.07573</pub-id></nlm-citation></ref><ref id="ref44"><label>44</label><nlm-citation citation-type="web"><article-title>CHEO-EHIL/pysdg-releases</article-title><source>GitHub</source><access-date>2026-06-01</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://github.com/CHEO-EHIL/pysdg-releases">https://github.com/CHEO-EHIL/pysdg-releases</ext-link></comment></nlm-citation></ref><ref id="ref45"><label>45</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Hu</surname><given-names>EJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Wallis</surname><given-names>P</given-names> </name><etal/></person-group><article-title>LoRA: low-rank adaptation of large language models</article-title><source>arXiv</source><comment>Preprint posted online on Jun 17, 2021</comment><pub-id pub-id-type="doi">10.48550/arXiv.2106.09685</pub-id></nlm-citation></ref><ref id="ref46"><label>46</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>Zhang</surname><given-names>K</given-names> </name><name name-style="western"><surname>Zeng</surname><given-names>S</given-names> </name><name name-style="western"><surname>Hua</surname><given-names>E</given-names> </name><etal/></person-group><article-title>UltraMedical: building specialized generalists in biomedicine</article-title><source>arXiv</source><comment>Preprint
posted online on Jun 6, 2024</comment><pub-id pub-id-type="doi">10.48550/arXiv.2406.03949</pub-id></nlm-citation></ref><ref id="ref47"><label>47</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Ke</surname><given-names>G</given-names> </name><name name-style="western"><surname>Meng</surname><given-names>Q</given-names> </name><name name-style="western"><surname>Finley</surname><given-names>T</given-names> </name><etal/></person-group><article-title>LightGBM: a highly efficient gradient boosting decision tree</article-title><access-date>2026-05-21</access-date><conf-name>31st Conference on Neural Information Processing Systems (NIPS 2017)</conf-name><conf-date>Dec 4-9, 2017</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper_files/paper/2017/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf">https://proceedings.neurips.cc/paper_files/paper/2017/file/6449f44a102fde848669bdd9eb6b76fa-Paper.pdf</ext-link></comment></nlm-citation></ref><ref id="ref48"><label>48</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Rousset</surname><given-names>A</given-names> </name><name name-style="western"><surname>Dellamonica</surname><given-names>D</given-names> </name><name name-style="western"><surname>Menuet</surname><given-names>R</given-names> </name><etal/></person-group><article-title>Can machine learning bring cardiovascular risk assessment to the next level?
A methodological study using FOURIER trial data</article-title><source>Eur Heart J Digit Health</source><year>2021</year><month>03</month><volume>3</volume><issue>1</issue><fpage>38</fpage><lpage>48</lpage><pub-id pub-id-type="doi">10.1093/ehjdh/ztab093</pub-id><pub-id pub-id-type="medline">36713994</pub-id></nlm-citation></ref><ref id="ref49"><label>49</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Weng</surname><given-names>SF</given-names> </name><name name-style="western"><surname>Reps</surname><given-names>J</given-names> </name><name name-style="western"><surname>Kai</surname><given-names>J</given-names> </name><name name-style="western"><surname>Garibaldi</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Qureshi</surname><given-names>N</given-names> </name></person-group><article-title>Can machine-learning improve cardiovascular risk prediction using routine clinical data?</article-title><source>PLOS ONE</source><year>2017</year><volume>12</volume><issue>4</issue><fpage>e0174944</fpage><pub-id pub-id-type="doi">10.1371/journal.pone.0174944</pub-id><pub-id pub-id-type="medline">28376093</pub-id></nlm-citation></ref><ref id="ref50"><label>50</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Akyea</surname><given-names>RK</given-names> </name><name name-style="western"><surname>Qureshi</surname><given-names>N</given-names> </name><name name-style="western"><surname>Kai</surname><given-names>J</given-names> </name><name name-style="western"><surname>Weng</surname><given-names>SF</given-names> </name></person-group><article-title>Performance and clinical utility of supervised machine-learning approaches in detecting familial hypercholesterolaemia in primary care</article-title><source>NPJ Digit Med</source><year>2020</year><volume>3</volume><fpage>142</fpage><pub-id 
pub-id-type="doi">10.1038/s41746-020-00349-5</pub-id><pub-id pub-id-type="medline">33145438</pub-id></nlm-citation></ref><ref id="ref51"><label>51</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Desai</surname><given-names>RJ</given-names> </name><name name-style="western"><surname>Wang</surname><given-names>SV</given-names> </name><name name-style="western"><surname>Vaduganathan</surname><given-names>M</given-names> </name><name name-style="western"><surname>Evers</surname><given-names>T</given-names> </name><name name-style="western"><surname>Schneeweiss</surname><given-names>S</given-names> </name></person-group><article-title>Comparison of machine learning methods with traditional models for use of administrative claims with electronic medical records to predict heart failure outcomes</article-title><source>JAMA Netw Open</source><year>2020</year><month>01</month><day>3</day><volume>3</volume><issue>1</issue><fpage>e1918962</fpage><pub-id pub-id-type="doi">10.1001/jamanetworkopen.2019.18962</pub-id><pub-id pub-id-type="medline">31922560</pub-id></nlm-citation></ref><ref id="ref52"><label>52</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Li</surname><given-names>YM</given-names> </name><name name-style="western"><surname>Jiang</surname><given-names>LC</given-names> </name><name name-style="western"><surname>He</surname><given-names>JJ</given-names> </name><name name-style="western"><surname>Jia</surname><given-names>KY</given-names> </name><name name-style="western"><surname>Peng</surname><given-names>Y</given-names> </name><name name-style="western"><surname>Chen</surname><given-names>M</given-names> </name></person-group><article-title>Machine learning to predict the 1-year mortality rate after acute anterior myocardial infarction in Chinese patients</article-title><source>Ther Clin Risk
Manag</source><year>2020</year><volume>16</volume><fpage>1</fpage><lpage>6</lpage><pub-id pub-id-type="doi">10.2147/TCRM.S236498</pub-id></nlm-citation></ref><ref id="ref53"><label>53</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Shwartz-Ziv</surname><given-names>R</given-names> </name><name name-style="western"><surname>Armon</surname><given-names>A</given-names> </name></person-group><article-title>Tabular data: deep learning is not all you need</article-title><source>Inf Fusion</source><year>2022</year><month>05</month><volume>81</volume><fpage>84</fpage><lpage>90</lpage><pub-id pub-id-type="doi">10.1016/j.inffus.2021.11.011</pub-id></nlm-citation></ref><ref id="ref54"><label>54</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Grinsztajn</surname><given-names>L</given-names> </name><name name-style="western"><surname>Oyallon</surname><given-names>E</given-names> </name><name name-style="western"><surname>Varoquaux</surname><given-names>G</given-names> </name></person-group><article-title>Why do tree-based models still outperform deep learning on typical tabular data?</article-title><conf-name>36th Conference on Neural Information Processing Systems (NeurIPS 2022) Track on Datasets and Benchmarks</conf-name><conf-date>Nov 28 to Dec 9, 2022</conf-date><pub-id pub-id-type="doi">10.52202/068431-0037</pub-id></nlm-citation></ref><ref id="ref55"><label>55</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name name-style="western"><surname>Snoek</surname><given-names>J</given-names> </name><name name-style="western"><surname>Larochelle</surname><given-names>H</given-names> </name><name name-style="western"><surname>Adams</surname><given-names>RP</given-names> </name></person-group><article-title>Practical Bayesian optimization of machine learning 
algorithms</article-title><access-date>2026-05-21</access-date><conf-name>Advances in Neural Information Processing Systems 25 (NIPS 2012)</conf-name><conf-date>Dec 3-8, 2012</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://proceedings.neurips.cc/paper/2012/hash/05311655a15b75fab86956663e1819cd-Abstract.html">https://proceedings.neurips.cc/paper/2012/hash/05311655a15b75fab86956663e1819cd-Abstract.html</ext-link></comment></nlm-citation></ref><ref id="ref56"><label>56</label><nlm-citation citation-type="book"><person-group person-group-type="author"><name name-style="western"><surname>Bartz</surname><given-names>E</given-names> </name><name name-style="western"><surname>Bartz-Beielstein</surname><given-names>T</given-names> </name><name name-style="western"><surname>Zaefferer</surname><given-names>M</given-names> </name><name name-style="western"><surname>Mersmann</surname><given-names>O</given-names> </name></person-group><source>Hyperparameter Tuning for Machine and Deep Learning with R: A Practical Guide</source><year>2023</year><publisher-name>Springer</publisher-name><pub-id pub-id-type="doi">10.1007/978-981-19-5170-1</pub-id></nlm-citation></ref><ref id="ref57"><label>57</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Bischl</surname><given-names>B</given-names> </name><name name-style="western"><surname>Binder</surname><given-names>M</given-names> </name><name name-style="western"><surname>Lang</surname><given-names>M</given-names> </name><etal/></person-group><article-title>Hyperparameter optimization: foundations, algorithms, best practices, and open challenges</article-title><source>WIREs Data Min Knowl Discov</source><year>2023</year><volume>13</volume><fpage>e1484</fpage><pub-id pub-id-type="doi">10.1002/widm.1484</pub-id></nlm-citation></ref><ref id="ref58"><label>58</label><nlm-citation citation-type="confproc"><person-group person-group-type="author"><name 
name-style="western"><surname>Binder</surname><given-names>M</given-names> </name><name name-style="western"><surname>Pfisterer</surname><given-names>F</given-names> </name><name name-style="western"><surname>Bischl</surname><given-names>B</given-names> </name></person-group><article-title>Collecting empirical data about hyperparameters for data driven AutoML</article-title><access-date>2026-05-21</access-date><conf-name>7th ICML Workshop on Automated Machine Learning</conf-name><conf-date>Jul 17-18, 2020</conf-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.automl.org/wp-content/uploads/2020/07/AutoML_2020_paper_63.pdf">https://www.automl.org/wp-content/uploads/2020/07/AutoML_2020_paper_63.pdf</ext-link></comment></nlm-citation></ref><ref id="ref59"><label>59</label><nlm-citation citation-type="other"><person-group person-group-type="author"><name name-style="western"><surname>K&#x00FC;hn</surname><given-names>D</given-names> </name><name name-style="western"><surname>Probst</surname><given-names>P</given-names> </name><name name-style="western"><surname>Thomas</surname><given-names>J</given-names> </name><name name-style="western"><surname>Bischl</surname><given-names>B</given-names> </name></person-group><article-title>Automatic exploration of machine learning experiments on OpenML</article-title><source>arXiv</source><comment>Preprint posted online on Jun 28, 2018</comment><pub-id pub-id-type="doi">10.48550/arXiv.1806.10961</pub-id></nlm-citation></ref><ref id="ref60"><label>60</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>JM</given-names> </name><name name-style="western"><surname>Khoshgoftaar</surname><given-names>TM</given-names> </name></person-group><article-title>Medical provider embeddings for healthcare fraud detection</article-title><source>SN Comput
Sci</source><year>2021</year><month>07</month><volume>2</volume><issue>4</issue><fpage>276</fpage><pub-id pub-id-type="doi">10.1007/s42979-021-00656-y</pub-id></nlm-citation></ref><ref id="ref61"><label>61</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Wang</surname><given-names>W</given-names> </name><name name-style="western"><surname>Pai</surname><given-names>TW</given-names> </name></person-group><article-title>Enhancing small tabular clinical trial dataset through hybrid data augmentation: combining SMOTE and WCGAN-GP</article-title><source>Data (Basel)</source><year>2023</year><volume>8</volume><issue>9</issue><fpage>135</fpage><pub-id pub-id-type="doi">10.3390/data8090135</pub-id></nlm-citation></ref><ref id="ref62"><label>62</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Papadopoulos</surname><given-names>D</given-names> </name><name name-style="western"><surname>Karalis</surname><given-names>VD</given-names> </name></person-group><article-title>Variational autoencoders for data augmentation in clinical studies</article-title><source>Appl Sci (Basel)</source><year>2023</year><volume>13</volume><issue>15</issue><fpage>8793</fpage><pub-id pub-id-type="doi">10.3390/app13158793</pub-id></nlm-citation></ref><ref id="ref63"><label>63</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Pilgram</surname><given-names>L</given-names> </name><name name-style="western"><surname>El Kababji</surname><given-names>S</given-names> </name><name name-style="western"><surname>Liu</surname><given-names>D</given-names> </name><name name-style="western"><surname>El Emam</surname><given-names>K</given-names> </name></person-group><article-title>Magnitude and impact of hallucinations in tabular synthetic health data on prognostic machine learning models: validation 
study</article-title><source>J Med Internet Res</source><year>2025</year><month>08</month><day>18</day><volume>27</volume><fpage>e77893</fpage><pub-id pub-id-type="doi">10.2196/77893</pub-id><pub-id pub-id-type="medline">40825542</pub-id></nlm-citation></ref><ref id="ref64"><label>64</label><nlm-citation citation-type="web"><article-title>Data</article-title><source>BORN Ontario</source><access-date>2026-06-08</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.bornontario.ca/data/">https://www.bornontario.ca/data/</ext-link></comment></nlm-citation></ref><ref id="ref65"><label>65</label><nlm-citation citation-type="web"><article-title>BSA inpatient claims PUF</article-title><source>CMS.gov</source><access-date>2026-06-08</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.cms.gov/data-research/statistics-trends-and-reports/basic-stand-alone-medicare-claims-public-use-files/bsa-inpatient-claims-puf">https://www.cms.gov/data-research/statistics-trends-and-reports/basic-stand-alone-medicare-claims-public-use-files/bsa-inpatient-claims-puf</ext-link></comment></nlm-citation></ref><ref id="ref66"><label>66</label><nlm-citation citation-type="web"><article-title>Healthcare cost and utilization project (HCUP)</article-title><source>Agency for Healthcare Research and Quality</source><year>2005</year><access-date>2026-05-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.ahrq.gov/data/hcup/index.html">https://www.ahrq.gov/data/hcup/index.html</ext-link></comment></nlm-citation></ref><ref id="ref67"><label>67</label><nlm-citation citation-type="web"><article-title>How to access Canadian community health survey (CCHS) data</article-title><source>Statistics Canada</source><access-date>2026-06-08</access-date><comment><ext-link ext-link-type="uri" 
xlink:href="https://www150.statcan.gc.ca/n1/pub/82-620-m/2005001/4144189-eng.htm">https://www150.statcan.gc.ca/n1/pub/82-620-m/2005001/4144189-eng.htm</ext-link></comment></nlm-citation></ref><ref id="ref68"><label>68</label><nlm-citation citation-type="web"><article-title>COVID-19 resources Canada</article-title><source>ArcGIS Hub</source><access-date>2026-06-08</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://resources-covid19canada.hub.arcgis.com">https://resources-covid19canada.hub.arcgis.com</ext-link></comment></nlm-citation></ref><ref id="ref69"><label>69</label><nlm-citation citation-type="web"><article-title>FDA adverse event monitoring system (AEMS) latest quarterly data files</article-title><source>US Food and Drug Administration</source><access-date>2026-06-08</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.fda.gov/drugs/fda-adverse-event-monitoring-system-aems/fda-adverse-event-monitoring-system-aems-latest-quarterly-data-files">https://www.fda.gov/drugs/fda-adverse-event-monitoring-system-aems/fda-adverse-event-monitoring-system-aems-latest-quarterly-data-files</ext-link></comment></nlm-citation></ref><ref id="ref70"><label>70</label><nlm-citation citation-type="web"><person-group person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>A</given-names> </name><name name-style="western"><surname>Pollard</surname><given-names>T</given-names> </name><name name-style="western"><surname>Mark</surname><given-names>R</given-names> </name></person-group><article-title>MIMIC-III clinical database</article-title><source>PhysioNet</source><year>2016</year><access-date>2026-05-21</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://physionet.org/content/mimiciii/1.4/">https://physionet.org/content/mimiciii/1.4/</ext-link></comment></nlm-citation></ref><ref id="ref71"><label>71</label><nlm-citation citation-type="journal"><person-group
person-group-type="author"><name name-style="western"><surname>Johnson</surname><given-names>AEW</given-names> </name><name name-style="western"><surname>Pollard</surname><given-names>TJ</given-names> </name><name name-style="western"><surname>Shen</surname><given-names>L</given-names> </name><etal/></person-group><article-title>MIMIC-III, a freely accessible critical care database</article-title><source>Sci Data</source><year>2016</year><month>05</month><day>24</day><volume>3</volume><fpage>160035</fpage><pub-id pub-id-type="doi">10.1038/sdata.2016.35</pub-id><pub-id pub-id-type="medline">27219127</pub-id></nlm-citation></ref><ref id="ref72"><label>72</label><nlm-citation citation-type="journal"><person-group person-group-type="author"><name name-style="western"><surname>Goldberger</surname><given-names>AL</given-names> </name><name name-style="western"><surname>Amaral</surname><given-names>LA</given-names> </name><name name-style="western"><surname>Glass</surname><given-names>L</given-names> </name><etal/></person-group><article-title>PhysioBank, PhysioToolkit, and PhysioNet: components of a new research resource for complex physiologic signals</article-title><source>Circulation</source><year>2000</year><month>06</month><day>13</day><volume>101</volume><issue>23</issue><fpage>E215</fpage><lpage>20</lpage><pub-id pub-id-type="doi">10.1161/01.cir.101.23.e215</pub-id><pub-id pub-id-type="medline">10851218</pub-id></nlm-citation></ref><ref id="ref73"><label>73</label><nlm-citation citation-type="web"><article-title>Download dataset</article-title><source>COVID-19</source><access-date>2026-06-08</access-date><comment><ext-link ext-link-type="uri" xlink:href="https://www.covid19survivalcalculator.com/en/download">https://www.covid19survivalcalculator.com/en/download</ext-link></comment></nlm-citation></ref><ref id="ref74"><label>74</label><nlm-citation citation-type="report"><article-title>Texas hospital inpatient discharge public use data 
file</article-title><year>2025</year><access-date>2026-05-21</access-date><publisher-name>Texas Department of State Health Services (DSHS)</publisher-name><comment><ext-link ext-link-type="uri" xlink:href="https://www.dshs.texas.gov/sites/default/files/thcic/hospitals/inpatientdatadictionary1q2025.pdf">https://www.dshs.texas.gov/sites/default/files/thcic/hospitals/inpatientdatadictionary1q2025.pdf</ext-link></comment></nlm-citation></ref></ref-list><app-group><supplementary-material id="app1"><label>Multimedia Appendix 1</label><p>Additional methodological details and results.</p><media xlink:href="jmir_v28i1e88678_app1.pdf" xlink:title="PDF File, 781 KB"/></supplementary-material></app-group></back></article>